From 6e7cc8a3f88ceec9c29e8b535a5f9c0c25f0b1ea Mon Sep 17 00:00:00 2001 From: Mathieu Malaterre Date: Wed, 2 Nov 2022 14:55:21 +0000 Subject: [PATCH] Import highway_1.0.2.orig.tar.gz [dgit import orig highway_1.0.2.orig.tar.gz] --- .github/workflows/build_test.yml | 57 + BUILD | 413 ++ CMakeLists.txt | 580 ++ CMakeLists.txt.in | 15 + CONTRIBUTING | 33 + LICENSE | 201 + README.md | 322 + WORKSPACE | 24 + cmake/FindAtomics.cmake | 56 + debian/changelog | 157 + debian/compat | 1 + debian/control | 23 + debian/copyright | 20 + debian/rules | 6 + debian/source/format | 1 + g3doc/design_philosophy.md | 186 + g3doc/faq.md | 310 + g3doc/highway_intro.pdf | Bin 0 -> 1313237 bytes g3doc/impl_details.md | 221 + g3doc/instruction_matrix.pdf | Bin 0 -> 155399 bytes g3doc/quick_reference.md | 1511 +++++ g3doc/release_testing_process.md | 37 + hwy.gni | 53 + hwy/aligned_allocator.cc | 152 + hwy/aligned_allocator.h | 212 + hwy/aligned_allocator_test.cc | 278 + hwy/base.h | 946 +++ hwy/base_test.cc | 178 + hwy/cache_control.h | 110 + hwy/contrib/algo/copy-inl.h | 136 + hwy/contrib/algo/copy_test.cc | 199 + hwy/contrib/algo/find-inl.h | 109 + hwy/contrib/algo/find_test.cc | 219 + hwy/contrib/algo/transform-inl.h | 262 + hwy/contrib/algo/transform_test.cc | 372 ++ hwy/contrib/dot/dot-inl.h | 252 + hwy/contrib/dot/dot_test.cc | 167 + hwy/contrib/image/image.cc | 145 + hwy/contrib/image/image.h | 471 ++ hwy/contrib/image/image_test.cc | 152 + hwy/contrib/math/math-inl.h | 1242 ++++ hwy/contrib/math/math_test.cc | 227 + hwy/contrib/sort/BUILD | 190 + hwy/contrib/sort/README.md | 87 + hwy/contrib/sort/algo-inl.h | 512 ++ hwy/contrib/sort/bench_parallel.cc | 238 + hwy/contrib/sort/bench_sort.cc | 310 + hwy/contrib/sort/print_network.cc | 191 + hwy/contrib/sort/result-inl.h | 139 + hwy/contrib/sort/shared-inl.h | 133 + hwy/contrib/sort/sort_test.cc | 626 ++ hwy/contrib/sort/sorting_networks-inl.h | 695 +++ hwy/contrib/sort/traits-inl.h | 527 ++ hwy/contrib/sort/traits128-inl.h | 492 ++ hwy/contrib/sort/vqsort-inl.h | 1443 +++++ hwy/contrib/sort/vqsort.cc | 184 + hwy/contrib/sort/vqsort.h | 108 + hwy/contrib/sort/vqsort_128a.cc | 62 + hwy/contrib/sort/vqsort_128d.cc | 62 + hwy/contrib/sort/vqsort_f32a.cc | 53 + hwy/contrib/sort/vqsort_f32d.cc | 54 + hwy/contrib/sort/vqsort_f64a.cc | 61 + hwy/contrib/sort/vqsort_f64d.cc | 61 + hwy/contrib/sort/vqsort_i16a.cc | 54 + hwy/contrib/sort/vqsort_i16d.cc | 54 + hwy/contrib/sort/vqsort_i32a.cc | 54 + hwy/contrib/sort/vqsort_i32d.cc | 54 + hwy/contrib/sort/vqsort_i64a.cc | 54 + hwy/contrib/sort/vqsort_i64d.cc | 54 + hwy/contrib/sort/vqsort_kv128a.cc | 65 + hwy/contrib/sort/vqsort_kv128d.cc | 65 + hwy/contrib/sort/vqsort_kv64a.cc | 65 + hwy/contrib/sort/vqsort_kv64d.cc | 65 + hwy/contrib/sort/vqsort_u16a.cc | 54 + hwy/contrib/sort/vqsort_u16d.cc | 55 + hwy/contrib/sort/vqsort_u32a.cc | 54 + hwy/contrib/sort/vqsort_u32d.cc | 55 + hwy/contrib/sort/vqsort_u64a.cc | 54 + hwy/contrib/sort/vqsort_u64d.cc | 55 + hwy/detect_compiler_arch.h | 234 + hwy/detect_targets.h | 478 ++ hwy/examples/benchmark.cc | 254 + hwy/examples/skeleton-inl.h | 66 + hwy/examples/skeleton.cc | 121 + hwy/examples/skeleton.h | 36 + hwy/examples/skeleton_test.cc | 110 + hwy/foreach_target.h | 261 + hwy/highway.h | 378 ++ hwy/highway_export.h | 74 + hwy/highway_test.cc | 485 ++ hwy/hwy.version | 19 + hwy/nanobenchmark.cc | 762 +++ hwy/nanobenchmark.h | 194 + hwy/nanobenchmark_test.cc | 94 + hwy/ops/arm_neon-inl.h | 6664 ++++++++++++++++++++ hwy/ops/arm_sve-inl.h | 3151 ++++++++++ hwy/ops/emu128-inl.h | 2511 
++++++++ hwy/ops/generic_ops-inl.h | 1357 ++++ hwy/ops/rvv-inl.h | 3405 +++++++++++ hwy/ops/scalar-inl.h | 1571 +++++ hwy/ops/set_macros-inl.h | 444 ++ hwy/ops/shared-inl.h | 311 + hwy/ops/wasm_128-inl.h | 4589 ++++++++++++++ hwy/ops/wasm_256-inl.h | 3060 +++++++++ hwy/ops/x86_128-inl.h | 7485 +++++++++++++++++++++++ hwy/ops/x86_256-inl.h | 5619 +++++++++++++++++ hwy/ops/x86_512-inl.h | 4412 +++++++++++++ hwy/per_target.cc | 50 + hwy/per_target.h | 37 + hwy/print-inl.h | 55 + hwy/print.cc | 110 + hwy/print.h | 73 + hwy/targets.cc | 434 ++ hwy/targets.h | 318 + hwy/targets_test.cc | 135 + hwy/tests/arithmetic_test.cc | 445 ++ hwy/tests/blockwise_shift_test.cc | 268 + hwy/tests/blockwise_test.cc | 452 ++ hwy/tests/combine_test.cc | 273 + hwy/tests/compare_test.cc | 509 ++ hwy/tests/compress_test.cc | 757 +++ hwy/tests/convert_test.cc | 643 ++ hwy/tests/crypto_test.cc | 553 ++ hwy/tests/demote_test.cc | 326 + hwy/tests/float_test.cc | 349 ++ hwy/tests/hwy_gtest.h | 157 + hwy/tests/if_test.cc | 175 + hwy/tests/interleaved_test.cc | 256 + hwy/tests/list_targets.cc | 71 + hwy/tests/logical_test.cc | 270 + hwy/tests/mask_mem_test.cc | 197 + hwy/tests/mask_test.cc | 293 + hwy/tests/memory_test.cc | 341 ++ hwy/tests/mul_test.cc | 446 ++ hwy/tests/reduction_test.cc | 227 + hwy/tests/reverse_test.cc | 176 + hwy/tests/shift_test.cc | 428 ++ hwy/tests/swizzle_test.cc | 272 + hwy/tests/test_util-inl.h | 665 ++ hwy/tests/test_util.cc | 117 + hwy/tests/test_util.h | 172 + hwy/tests/test_util_test.cc | 105 + libhwy-contrib.pc.in | 10 + libhwy-test.pc.in | 11 + libhwy.pc.in | 10 + preamble.js.lds | 9 + run_tests.bat | 20 + run_tests.sh | 80 + 148 files changed, 75635 insertions(+) create mode 100644 .github/workflows/build_test.yml create mode 100644 BUILD create mode 100644 CMakeLists.txt create mode 100644 CMakeLists.txt.in create mode 100644 CONTRIBUTING create mode 100644 LICENSE create mode 100644 README.md create mode 100644 WORKSPACE create mode 100644 cmake/FindAtomics.cmake create mode 100644 debian/changelog create mode 100644 debian/compat create mode 100644 debian/control create mode 100644 debian/copyright create mode 100755 debian/rules create mode 100644 debian/source/format create mode 100644 g3doc/design_philosophy.md create mode 100644 g3doc/faq.md create mode 100644 g3doc/highway_intro.pdf create mode 100644 g3doc/impl_details.md create mode 100644 g3doc/instruction_matrix.pdf create mode 100644 g3doc/quick_reference.md create mode 100644 g3doc/release_testing_process.md create mode 100644 hwy.gni create mode 100644 hwy/aligned_allocator.cc create mode 100644 hwy/aligned_allocator.h create mode 100644 hwy/aligned_allocator_test.cc create mode 100644 hwy/base.h create mode 100644 hwy/base_test.cc create mode 100644 hwy/cache_control.h create mode 100644 hwy/contrib/algo/copy-inl.h create mode 100644 hwy/contrib/algo/copy_test.cc create mode 100644 hwy/contrib/algo/find-inl.h create mode 100644 hwy/contrib/algo/find_test.cc create mode 100644 hwy/contrib/algo/transform-inl.h create mode 100644 hwy/contrib/algo/transform_test.cc create mode 100644 hwy/contrib/dot/dot-inl.h create mode 100644 hwy/contrib/dot/dot_test.cc create mode 100644 hwy/contrib/image/image.cc create mode 100644 hwy/contrib/image/image.h create mode 100644 hwy/contrib/image/image_test.cc create mode 100644 hwy/contrib/math/math-inl.h create mode 100644 hwy/contrib/math/math_test.cc create mode 100644 hwy/contrib/sort/BUILD create mode 100644 hwy/contrib/sort/README.md create mode 100644 hwy/contrib/sort/algo-inl.h create 
mode 100644 hwy/contrib/sort/bench_parallel.cc create mode 100644 hwy/contrib/sort/bench_sort.cc create mode 100644 hwy/contrib/sort/print_network.cc create mode 100644 hwy/contrib/sort/result-inl.h create mode 100644 hwy/contrib/sort/shared-inl.h create mode 100644 hwy/contrib/sort/sort_test.cc create mode 100644 hwy/contrib/sort/sorting_networks-inl.h create mode 100644 hwy/contrib/sort/traits-inl.h create mode 100644 hwy/contrib/sort/traits128-inl.h create mode 100644 hwy/contrib/sort/vqsort-inl.h create mode 100644 hwy/contrib/sort/vqsort.cc create mode 100644 hwy/contrib/sort/vqsort.h create mode 100644 hwy/contrib/sort/vqsort_128a.cc create mode 100644 hwy/contrib/sort/vqsort_128d.cc create mode 100644 hwy/contrib/sort/vqsort_f32a.cc create mode 100644 hwy/contrib/sort/vqsort_f32d.cc create mode 100644 hwy/contrib/sort/vqsort_f64a.cc create mode 100644 hwy/contrib/sort/vqsort_f64d.cc create mode 100644 hwy/contrib/sort/vqsort_i16a.cc create mode 100644 hwy/contrib/sort/vqsort_i16d.cc create mode 100644 hwy/contrib/sort/vqsort_i32a.cc create mode 100644 hwy/contrib/sort/vqsort_i32d.cc create mode 100644 hwy/contrib/sort/vqsort_i64a.cc create mode 100644 hwy/contrib/sort/vqsort_i64d.cc create mode 100644 hwy/contrib/sort/vqsort_kv128a.cc create mode 100644 hwy/contrib/sort/vqsort_kv128d.cc create mode 100644 hwy/contrib/sort/vqsort_kv64a.cc create mode 100644 hwy/contrib/sort/vqsort_kv64d.cc create mode 100644 hwy/contrib/sort/vqsort_u16a.cc create mode 100644 hwy/contrib/sort/vqsort_u16d.cc create mode 100644 hwy/contrib/sort/vqsort_u32a.cc create mode 100644 hwy/contrib/sort/vqsort_u32d.cc create mode 100644 hwy/contrib/sort/vqsort_u64a.cc create mode 100644 hwy/contrib/sort/vqsort_u64d.cc create mode 100644 hwy/detect_compiler_arch.h create mode 100644 hwy/detect_targets.h create mode 100644 hwy/examples/benchmark.cc create mode 100644 hwy/examples/skeleton-inl.h create mode 100644 hwy/examples/skeleton.cc create mode 100644 hwy/examples/skeleton.h create mode 100644 hwy/examples/skeleton_test.cc create mode 100644 hwy/foreach_target.h create mode 100644 hwy/highway.h create mode 100644 hwy/highway_export.h create mode 100644 hwy/highway_test.cc create mode 100644 hwy/hwy.version create mode 100644 hwy/nanobenchmark.cc create mode 100644 hwy/nanobenchmark.h create mode 100644 hwy/nanobenchmark_test.cc create mode 100644 hwy/ops/arm_neon-inl.h create mode 100644 hwy/ops/arm_sve-inl.h create mode 100644 hwy/ops/emu128-inl.h create mode 100644 hwy/ops/generic_ops-inl.h create mode 100644 hwy/ops/rvv-inl.h create mode 100644 hwy/ops/scalar-inl.h create mode 100644 hwy/ops/set_macros-inl.h create mode 100644 hwy/ops/shared-inl.h create mode 100644 hwy/ops/wasm_128-inl.h create mode 100644 hwy/ops/wasm_256-inl.h create mode 100644 hwy/ops/x86_128-inl.h create mode 100644 hwy/ops/x86_256-inl.h create mode 100644 hwy/ops/x86_512-inl.h create mode 100644 hwy/per_target.cc create mode 100644 hwy/per_target.h create mode 100644 hwy/print-inl.h create mode 100644 hwy/print.cc create mode 100644 hwy/print.h create mode 100644 hwy/targets.cc create mode 100644 hwy/targets.h create mode 100644 hwy/targets_test.cc create mode 100644 hwy/tests/arithmetic_test.cc create mode 100644 hwy/tests/blockwise_shift_test.cc create mode 100644 hwy/tests/blockwise_test.cc create mode 100644 hwy/tests/combine_test.cc create mode 100644 hwy/tests/compare_test.cc create mode 100644 hwy/tests/compress_test.cc create mode 100644 hwy/tests/convert_test.cc create mode 100644 hwy/tests/crypto_test.cc create mode 100644 
hwy/tests/demote_test.cc create mode 100644 hwy/tests/float_test.cc create mode 100644 hwy/tests/hwy_gtest.h create mode 100644 hwy/tests/if_test.cc create mode 100644 hwy/tests/interleaved_test.cc create mode 100644 hwy/tests/list_targets.cc create mode 100644 hwy/tests/logical_test.cc create mode 100644 hwy/tests/mask_mem_test.cc create mode 100644 hwy/tests/mask_test.cc create mode 100644 hwy/tests/memory_test.cc create mode 100644 hwy/tests/mul_test.cc create mode 100644 hwy/tests/reduction_test.cc create mode 100644 hwy/tests/reverse_test.cc create mode 100644 hwy/tests/shift_test.cc create mode 100644 hwy/tests/swizzle_test.cc create mode 100644 hwy/tests/test_util-inl.h create mode 100644 hwy/tests/test_util.cc create mode 100644 hwy/tests/test_util.h create mode 100644 hwy/tests/test_util_test.cc create mode 100644 libhwy-contrib.pc.in create mode 100644 libhwy-test.pc.in create mode 100644 libhwy.pc.in create mode 100644 preamble.js.lds create mode 100644 run_tests.bat create mode 100755 run_tests.sh diff --git a/.github/workflows/build_test.yml b/.github/workflows/build_test.yml new file mode 100644 index 0000000..bab1630 --- /dev/null +++ b/.github/workflows/build_test.yml @@ -0,0 +1,57 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +name: Build / test +on: [push, pull_request] +jobs: + cmake: + name: Build and test ${{ matrix.name }} + runs-on: ubuntu-18.04 + strategy: + matrix: + include: + - name: Clang-5.0 + extra_deps: clang-5.0 + c_compiler: clang-5.0 + cxx_compiler: clang++-5.0 + + - name: Clang-6.0 + extra_deps: clang-6.0 + c_compiler: clang-6.0 + cxx_compiler: clang++-6.0 + + steps: + - uses: actions/checkout@v2 + + - name: Install deps + run: sudo apt-get install ${{ matrix.extra_deps }} + + - name: Build and test + run: | + export CMAKE_BUILD_PARALLEL_LEVEL=2 + export CTEST_PARALLEL_LEVEL=2 + CXXFLAGS=-Werror CC=${{ matrix.c_compiler }} CXX=${{ matrix.cxx_compiler }} cmake -B out . + cmake --build out + ctest --test-dir out + + bazel: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - uses: bazelbuild/setup-bazelisk@v1 + - uses: actions/cache@v2 + with: + path: ~/.cache/bazel + key: bazel-${{ runner.os }} + - run: bazel build //... diff --git a/BUILD b/BUILD new file mode 100644 index 0000000..1928c32 --- /dev/null +++ b/BUILD @@ -0,0 +1,413 @@ +load("@bazel_skylib//lib:selects.bzl", "selects") + +load("@rules_cc//cc:defs.bzl", "cc_test") +package( + default_visibility = ["//visibility:public"], +) + +licenses(["notice"]) + +exports_files(["LICENSE"]) + +# Detect compiler: +config_setting( + name = "compiler_clang", + flag_values = {"@bazel_tools//tools/cpp:compiler": "clang"}, +) + +config_setting( + name = "compiler_clangcl", + flag_values = {"@bazel_tools//tools/cpp:compiler": "lexan"}, +) + +config_setting( + name = "compiler_msvc_actual", + flag_values = {"@bazel_tools//tools/cpp:compiler": "msvc"}, +) + +# The above is insufficient for Bazel on Windows, which does not seem to +# detect/set a compiler flag. 
This workaround prevents compile errors due to +# passing clang-only warning flags to MSVC. +config_setting( + name = "compiler_msvc_cpu", + values = { + "cpu": "x64_windows", + }, +) + +selects.config_setting_group( + name = "compiler_msvc", + match_any = [ + ":compiler_msvc_actual", + ":compiler_msvc_cpu", + ], +) + +config_setting( + name = "compiler_emscripten", + values = {"cpu": "wasm32"}, +) + +# See https://github.com/bazelbuild/bazel/issues/12707 +config_setting( + name = "compiler_gcc_bug", + flag_values = { + "@bazel_tools//tools/cpp:compiler": "compiler", + }, +) + +config_setting( + name = "compiler_gcc_actual", + flag_values = { + "@bazel_tools//tools/cpp:compiler": "gcc", + }, +) + +selects.config_setting_group( + name = "compiler_gcc", + match_any = [ + ":compiler_gcc_bug", + ":compiler_gcc_actual", + ], +) + +# Additional warnings for Clang OR GCC (skip for MSVC) +CLANG_GCC_COPTS = [ + "-Wunused-parameter", + "-Wunused-variable", + "-Wextra-semi", + "-Wunreachable-code", +] + +# Warnings supported by Clang and Clang-cl +CLANG_OR_CLANGCL_OPTS = CLANG_GCC_COPTS + [ + "-Wfloat-overflow-conversion", + "-Wfloat-zero-conversion", + "-Wfor-loop-analysis", + "-Wgnu-redeclared-enum", + "-Winfinite-recursion", + "-Wliteral-conversion", + "-Wno-c++98-compat", + "-Wno-unused-command-line-argument", + "-Wprivate-header", + "-Wself-assign", + "-Wstring-conversion", + "-Wtautological-overlap-compare", + "-Wthread-safety-analysis", + "-Wundefined-func-template", + "-Wunused-comparison", +] + +# Warnings only supported by Clang, but not Clang-cl +CLANG_ONLY_COPTS = CLANG_OR_CLANGCL_OPTS + [ + # Do not treat the third_party headers as system headers when building + # highway - the errors are pertinent. + "--no-system-header-prefix=third_party/highway", +] + +COPTS = select({ + ":compiler_msvc": [], + ":compiler_gcc": CLANG_GCC_COPTS, + ":compiler_clangcl": CLANG_OR_CLANGCL_OPTS, + # Default to clang because compiler detection only works in Bazel + "//conditions:default": CLANG_ONLY_COPTS, +}) + select({ + "@platforms//cpu:riscv64": [ + "-march=rv64gcv1p0", + "-menable-experimental-extensions", + ], + "//conditions:default": [ + ], +}) + +DEFINES = select({ + ":compiler_msvc": ["HWY_SHARED_DEFINE"], + ":compiler_clangcl": ["HWY_SHARED_DEFINE"], + "//conditions:default": [], +}) + +# Unused on Bazel builds, where this is not defined/known; Copybara replaces +# usages with an empty list. +COMPAT = [ + "//buildenv/target:non_prod", # includes mobile/vendor. +] + +# WARNING: changing flags such as HWY_DISABLED_TARGETS may break users without +# failing integration tests, if the machine running tests does not support the +# newly enabled instruction set, or the failure is only caught by sanitizers +# which do not run in CI. 
+ +cc_library( + name = "hwy", + srcs = [ + "hwy/aligned_allocator.cc", + "hwy/per_target.cc", + "hwy/print.cc", + "hwy/targets.cc", + ], + # Normal headers with include guards + hdrs = [ + "hwy/aligned_allocator.h", + "hwy/base.h", + "hwy/cache_control.h", + "hwy/detect_compiler_arch.h", # private + "hwy/print.h", + ], + compatible_with = [], + copts = COPTS, + defines = DEFINES, + local_defines = ["hwy_EXPORTS"], + textual_hdrs = [ + # These are textual because config macros influence them: + "hwy/detect_targets.h", # private + "hwy/targets.h", + # This .cc file #includes itself through foreach_target.h + "hwy/per_target.cc", + # End of list + "hwy/highway.h", # public + "hwy/foreach_target.h", # public + "hwy/per_target.h", # public + "hwy/print-inl.h", # public + "hwy/highway_export.h", # public + "hwy/ops/arm_neon-inl.h", + "hwy/ops/arm_sve-inl.h", + "hwy/ops/emu128-inl.h", + "hwy/ops/generic_ops-inl.h", + "hwy/ops/scalar-inl.h", + "hwy/ops/set_macros-inl.h", + "hwy/ops/shared-inl.h", + "hwy/ops/x86_128-inl.h", + "hwy/ops/x86_256-inl.h", + "hwy/ops/x86_512-inl.h", + # Select avoids recompiling native arch if only non-native changed + ] + select({ + ":compiler_emscripten": ["hwy/ops/wasm_128-inl.h"], + "//conditions:default": [], + }) + select({ + "@platforms//cpu:riscv64": ["hwy/ops/rvv-inl.h"], + "//conditions:default": [], + }), +) + +cc_library( + name = "algo", + compatible_with = [], + copts = COPTS, + textual_hdrs = [ + "hwy/contrib/algo/copy-inl.h", + "hwy/contrib/algo/find-inl.h", + "hwy/contrib/algo/transform-inl.h", + ], + deps = [ + ":hwy", + ], +) + +cc_library( + name = "dot", + compatible_with = [], + copts = COPTS, + textual_hdrs = [ + "hwy/contrib/dot/dot-inl.h", + ], + deps = [ + ":hwy", + ], +) + +cc_library( + name = "image", + srcs = [ + "hwy/contrib/image/image.cc", + ], + hdrs = [ + "hwy/contrib/image/image.h", + ], + compatible_with = [], + copts = COPTS, + local_defines = ["hwy_contrib_EXPORTS"], + deps = [ + ":hwy", + ], +) + +cc_library( + name = "math", + compatible_with = [], + copts = COPTS, + textual_hdrs = [ + "hwy/contrib/math/math-inl.h", + ], + deps = [ + ":hwy", + ], +) + +# Everything required for tests that use Highway. +cc_library( + name = "hwy_test_util", + srcs = ["hwy/tests/test_util.cc"], + hdrs = ["hwy/tests/test_util.h"], + compatible_with = [], + copts = COPTS, + local_defines = ["hwy_test_EXPORTS"], + textual_hdrs = [ + "hwy/tests/test_util-inl.h", + "hwy/tests/hwy_gtest.h", + ], + # Must not depend on a gtest variant, which can conflict with the + # GUNIT_INTERNAL_BUILD_MODE defined by the test. 
+ deps = [ + ":hwy", + ], +) + +cc_library( + name = "nanobenchmark", + srcs = ["hwy/nanobenchmark.cc"], + hdrs = ["hwy/nanobenchmark.h"], + compatible_with = [], + copts = COPTS, + local_defines = ["hwy_EXPORTS"], + deps = [":hwy"], +) + +cc_binary( + name = "benchmark", + srcs = ["hwy/examples/benchmark.cc"], + copts = COPTS, + deps = [ + ":hwy", + ":nanobenchmark", + ], +) + +cc_library( + name = "skeleton", + srcs = ["hwy/examples/skeleton.cc"], + hdrs = ["hwy/examples/skeleton.h"], + copts = COPTS, + local_defines = ["hwy_EXPORTS"], + textual_hdrs = ["hwy/examples/skeleton-inl.h"], + deps = [ + ":hwy", + ], +) + +cc_binary( + name = "list_targets", + srcs = ["hwy/tests/list_targets.cc"], + deps = [":hwy"], +) + +# path, name +HWY_TESTS = [ + ("hwy/contrib/algo/", "copy_test"), + ("hwy/contrib/algo/", "find_test"), + ("hwy/contrib/algo/", "transform_test"), + ("hwy/contrib/dot/", "dot_test"), + ("hwy/contrib/image/", "image_test"), + ("hwy/contrib/math/", "math_test"), + # contrib/sort has its own BUILD, we add it to GUITAR_TESTS. + ("hwy/examples/", "skeleton_test"), + ("hwy/", "nanobenchmark_test"), + ("hwy/", "aligned_allocator_test"), + ("hwy/", "base_test"), + ("hwy/", "highway_test"), + ("hwy/", "targets_test"), + ("hwy/tests/", "arithmetic_test"), + ("hwy/tests/", "blockwise_test"), + ("hwy/tests/", "blockwise_shift_test"), + ("hwy/tests/", "combine_test"), + ("hwy/tests/", "compare_test"), + ("hwy/tests/", "compress_test"), + ("hwy/tests/", "convert_test"), + ("hwy/tests/", "crypto_test"), + ("hwy/tests/", "demote_test"), + ("hwy/tests/", "float_test"), + ("hwy/tests/", "if_test"), + ("hwy/tests/", "interleaved_test"), + ("hwy/tests/", "logical_test"), + ("hwy/tests/", "mask_test"), + ("hwy/tests/", "mask_mem_test"), + ("hwy/tests/", "memory_test"), + ("hwy/tests/", "mul_test"), + ("hwy/tests/", "reduction_test"), + ("hwy/tests/", "reverse_test"), + ("hwy/tests/", "shift_test"), + ("hwy/tests/", "swizzle_test"), + ("hwy/tests/", "test_util_test"), +] + +HWY_TEST_COPTS = select({ + ":compiler_msvc": [], + "//conditions:default": [ + # gTest triggers this warning (which is enabled by the + # extra-semi in COPTS), so we need to disable it here, + # but it's still enabled for :hwy. + "-Wno-c++98-compat-extra-semi", + ], +}) + +HWY_TEST_DEPS = [ + ":algo", + ":dot", + ":hwy", + ":hwy_test_util", + ":image", + ":math", + ":nanobenchmark", + ":skeleton", + "//hwy/contrib/sort:vqsort", + "@com_google_googletest//:gtest_main", +] + +[ + [ + cc_test( + name = test, + size = "medium", + timeout = "long", # default moderate is not enough for math_test + srcs = [ + subdir + test + ".cc", + ], + copts = COPTS + HWY_TEST_COPTS, + features = select({ + "@platforms//cpu:riscv64": ["fully_static_link"], + "//conditions:default": [], + }), + linkopts = select({ + ":compiler_emscripten": [ + "-s ASSERTIONS=2", + "-s ENVIRONMENT=node,shell,web", + "-s ERROR_ON_UNDEFINED_SYMBOLS=1", + "-s DEMANGLE_SUPPORT=1", + "-s EXIT_RUNTIME=1", + "-s ALLOW_MEMORY_GROWTH=1", + "--pre-js $(location :preamble.js.lds)", + ], + "//conditions:default": [], + }), + linkstatic = select({ + "@platforms//cpu:riscv64": True, + "//conditions:default": False, + }), + local_defines = ["HWY_IS_TEST"], + # for test_suite. 
+ tags = ["hwy_ops_test"], + deps = HWY_TEST_DEPS + select({ + ":compiler_emscripten": [":preamble.js.lds"], + "//conditions:default": [], + }), + ), + ] + for subdir, test in HWY_TESTS +] + +# For manually building the tests we define here (:all does not work in --config=msvc) +test_suite( + name = "hwy_ops_tests", + tags = ["hwy_ops_test"], +) + +# Placeholder for integration test, do not remove diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 0000000..b6b14ab --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,580 @@ +# Copyright 2019 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +cmake_minimum_required(VERSION 3.10) + +# Set PIE flags for POSITION_INDEPENDENT_CODE targets, added in 3.14. +if(POLICY CMP0083) + cmake_policy(SET CMP0083 NEW) +endif() + +# Workaround for 3.19 raising error 'IMPORTED_LOCATION not set for imported +# target "GTest::gtest_main"'. +if(POLICY CMP0111) + cmake_policy(SET CMP0111 OLD) +endif() + +project(hwy VERSION 1.0.2) # Keep in sync with highway.h version + +# Directly define the ABI version from the cmake project() version values: +set(LIBRARY_VERSION "${hwy_VERSION}") +set(LIBRARY_SOVERSION ${hwy_VERSION_MAJOR}) + +set(CMAKE_CXX_EXTENSIONS OFF) + +list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake") +# Search for Atomics implementation: +find_package(Atomics REQUIRED) + +# Enabled PIE binaries by default if supported. +include(CheckPIESupported OPTIONAL RESULT_VARIABLE CHECK_PIE_SUPPORTED) +if(CHECK_PIE_SUPPORTED) + check_pie_supported(LANGUAGES CXX) + if(CMAKE_CXX_LINK_PIE_SUPPORTED) + set(CMAKE_POSITION_INDEPENDENT_CODE TRUE) + endif() +endif() + +include(GNUInstallDirs) + +if (NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE RelWithDebInfo) +endif() + +set(HWY_CMAKE_ARM7 OFF CACHE BOOL "Set copts for ARMv7 with NEON (requires vfpv4)?") + +# Unconditionally adding -Werror risks breaking the build when new warnings +# arise due to compiler/platform changes. Enable this in CI/tests. +set(HWY_WARNINGS_ARE_ERRORS OFF CACHE BOOL "Add -Werror flag?") + +set(HWY_ENABLE_CONTRIB ON CACHE BOOL "Include contrib/") +set(HWY_ENABLE_EXAMPLES ON CACHE BOOL "Build examples") +set(HWY_ENABLE_INSTALL ON CACHE BOOL "Install library") +set(HWY_ENABLE_TESTS ON CACHE BOOL "Enable HWY tests") + +include(CheckCXXSourceCompiles) +check_cxx_source_compiles( + "int main() { + #if !defined(__EMSCRIPTEN__) + static_assert(false, \"__EMSCRIPTEN__ is not defined\"); + #endif + return 0; + }" + HWY_EMSCRIPTEN +) + +check_cxx_source_compiles( + "int main() { + #if !defined(__riscv) + static_assert(false, \"__riscv is not defined\"); + #endif + return 0; + }" + HWY_RISCV +) + +if (HWY_ENABLE_CONTRIB) +# Glob all the traits so we don't need to modify this file when adding +# additional special cases. 
+file(GLOB HWY_CONTRIB_SOURCES "hwy/contrib/sort/vqsort_*.cc") +list(APPEND HWY_CONTRIB_SOURCES + hwy/contrib/dot/dot-inl.h + hwy/contrib/image/image.cc + hwy/contrib/image/image.h + hwy/contrib/math/math-inl.h + hwy/contrib/sort/shared-inl.h + hwy/contrib/sort/sorting_networks-inl.h + hwy/contrib/sort/traits-inl.h + hwy/contrib/sort/traits128-inl.h + hwy/contrib/sort/vqsort-inl.h + hwy/contrib/sort/vqsort.cc + hwy/contrib/sort/vqsort.h + hwy/contrib/algo/copy-inl.h + hwy/contrib/algo/find-inl.h + hwy/contrib/algo/transform-inl.h +) +endif() # HWY_ENABLE_CONTRIB + +set(HWY_SOURCES + hwy/aligned_allocator.cc + hwy/aligned_allocator.h + hwy/base.h + hwy/cache_control.h + hwy/detect_compiler_arch.h # private + hwy/detect_targets.h # private + hwy/foreach_target.h + hwy/highway.h + hwy/highway_export.h + hwy/nanobenchmark.cc + hwy/nanobenchmark.h + hwy/ops/arm_neon-inl.h + hwy/ops/arm_sve-inl.h + hwy/ops/emu128-inl.h + hwy/ops/generic_ops-inl.h + hwy/ops/rvv-inl.h + hwy/ops/scalar-inl.h + hwy/ops/set_macros-inl.h + hwy/ops/shared-inl.h + hwy/ops/wasm_128-inl.h + hwy/ops/x86_128-inl.h + hwy/ops/x86_256-inl.h + hwy/ops/x86_512-inl.h + hwy/per_target.cc + hwy/per_target.h + hwy/print-inl.h + hwy/print.cc + hwy/print.h + hwy/targets.cc + hwy/targets.h +) + +set(HWY_TEST_SOURCES + hwy/tests/hwy_gtest.h + hwy/tests/test_util-inl.h + hwy/tests/test_util.cc + hwy/tests/test_util.h +) + +if (MSVC) + set(HWY_FLAGS + # fix build error C1128 in blockwise*_test & arithmetic_test + /bigobj + ) +else() + set(HWY_FLAGS + # Avoid changing binaries based on the current time and date. + -Wno-builtin-macro-redefined + -D__DATE__="redacted" + -D__TIMESTAMP__="redacted" + -D__TIME__="redacted" + + # Optimizations + -fmerge-all-constants + + # Warnings + -Wall + -Wextra + # These are not included in Wall nor Wextra: + -Wconversion + -Wsign-conversion + -Wvla + -Wnon-virtual-dtor + ) + + if(${CMAKE_CXX_COMPILER_ID} MATCHES "Clang") + list(APPEND HWY_FLAGS + -Wfloat-overflow-conversion + -Wfloat-zero-conversion + -Wfor-loop-analysis + -Wgnu-redeclared-enum + -Winfinite-recursion + -Wself-assign + -Wstring-conversion + -Wtautological-overlap-compare + -Wthread-safety-analysis + -Wundefined-func-template + + -fno-cxx-exceptions + -fno-slp-vectorize + -fno-vectorize + + # Use color in messages + -fdiagnostics-show-option -fcolor-diagnostics + ) + if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 6.0) + list(APPEND HWY_FLAGS -Wc++2a-extensions) + endif() + endif() + + if (WIN32) + if(${CMAKE_CXX_COMPILER_ID} MATCHES "Clang") + list(APPEND HWY_FLAGS + -Wno-global-constructors + -Wno-language-extension-token + -Wno-used-but-marked-unused + -Wno-shadow-field-in-constructor + -Wno-unused-member-function + -Wno-unused-template + -Wno-c++98-compat-pedantic + -Wno-used-but-marked-unused + -Wno-zero-as-null-pointer-constant + ) + endif() + + list(APPEND HWY_FLAGS + -Wno-cast-align + -Wno-double-promotion + -Wno-float-equal + -Wno-format-nonliteral + -Wno-shadow + -Wno-sign-conversion + ) + else() + list(APPEND HWY_FLAGS + -fmath-errno + -fno-exceptions + ) + endif() # WIN32 + + if (HWY_CMAKE_ARM7) + list(APPEND HWY_FLAGS + -march=armv7-a + -mfpu=neon-vfpv4 + -mfloat-abi=hard # must match the toolchain specified as CXX= + -mfp16-format=ieee # required for vcvt_f32_f16 + ) + endif() # HWY_CMAKE_ARM7 + + if(HWY_RISCV) + if(${CMAKE_CXX_COMPILER_ID} MATCHES "Clang") + # Not yet supported by GCC. When runtime dispatch is supported and + # implemented, we will remove v from the required flags. 
Until then, using + # clang for RISC-V will require the CPU to support the V extension (1.0). + list(APPEND HWY_FLAGS -march=rv64gcv1p0) + list(APPEND HWY_FLAGS -menable-experimental-extensions) + endif() + endif() + + if (HWY_WARNINGS_ARE_ERRORS) + list(APPEND HWY_FLAGS -Werror) + endif() + + # Prevent "wasm-ld: error: --shared-memory is disallowed by targets.cc.o + # because it was not compiled with 'atomics' or 'bulk-memory' features." + if (HWY_EMSCRIPTEN) + list(APPEND HWY_FLAGS -matomics) + endif() + +endif() # !MSVC + +# By default prefer STATIC build (legacy behavior) +option(BUILD_SHARED_LIBS "Build shared libraries" OFF) +option(HWY_FORCE_STATIC_LIBS "Ignore BUILD_SHARED_LIBS" OFF) +# only expose shared/static options to advanced users: +mark_as_advanced(BUILD_SHARED_LIBS) +mark_as_advanced(HWY_FORCE_STATIC_LIBS) +# Define visibility settings globally: +set(CMAKE_CXX_VISIBILITY_PRESET hidden) +set(CMAKE_VISIBILITY_INLINES_HIDDEN 1) + +# Copy-cat "add_library" logic + add override. +set(HWY_LIBRARY_TYPE "SHARED") +if (NOT BUILD_SHARED_LIBS OR HWY_FORCE_STATIC_LIBS) + set(HWY_LIBRARY_TYPE "STATIC") +endif() + +# This preprocessor define will drive the build, also used in the *.pc files: +if("${HWY_LIBRARY_TYPE}" STREQUAL "SHARED") + set(DLLEXPORT_TO_DEFINE "HWY_SHARED_DEFINE") +else() + set(DLLEXPORT_TO_DEFINE "HWY_STATIC_DEFINE") +endif() + +add_library(hwy ${HWY_LIBRARY_TYPE} ${HWY_SOURCES}) +target_compile_definitions(hwy PUBLIC "${DLLEXPORT_TO_DEFINE}") +target_compile_options(hwy PRIVATE ${HWY_FLAGS}) +set_property(TARGET hwy PROPERTY POSITION_INDEPENDENT_CODE ON) +set_target_properties(hwy PROPERTIES VERSION ${LIBRARY_VERSION} SOVERSION ${LIBRARY_SOVERSION}) +target_include_directories(hwy PUBLIC + $<BUILD_INTERFACE:${CMAKE_CURRENT_LIST_DIR}> + $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>) +target_compile_features(hwy PUBLIC cxx_std_11) +set_target_properties(hwy PROPERTIES + LINK_DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/hwy/hwy.version) +# For GCC __atomic_store_8, see #887 +target_link_libraries(hwy PRIVATE ${ATOMICS_LIBRARIES}) +if(UNIX AND NOT APPLE) + # not supported by MSVC/Clang, safe to skip (we use DLLEXPORT annotations) + set_property(TARGET hwy APPEND_STRING PROPERTY + LINK_FLAGS " -Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/hwy/hwy.version") +endif() + +if (CMAKE_SYSTEM_PROCESSOR MATCHES "unknown") + # uname -p is broken on this system.
Try uname -m + EXECUTE_PROCESS( COMMAND uname -m + OUTPUT_STRIP_TRAILING_WHITESPACE + ERROR_QUIET + OUTPUT_VARIABLE HWY_ARCH) +else (CMAKE_SYSTEM_PROCESSOR MATCHES "unknown") + set(HWY_ARCH ${CMAKE_SYSTEM_PROCESSOR}) +endif (CMAKE_SYSTEM_PROCESSOR MATCHES "unknown") +message(STATUS "Architecture: " ${HWY_ARCH}) +if (HWY_ARCH MATCHES "mips") + target_link_options(hwy PUBLIC "LINKER:-z,noexecstack") +endif (HWY_ARCH MATCHES "mips") + + +if (HWY_ENABLE_CONTRIB) +add_library(hwy_contrib ${HWY_LIBRARY_TYPE} ${HWY_CONTRIB_SOURCES}) +target_link_libraries(hwy_contrib hwy) +target_compile_options(hwy_contrib PRIVATE ${HWY_FLAGS}) +set_property(TARGET hwy_contrib PROPERTY POSITION_INDEPENDENT_CODE ON) +set_target_properties(hwy_contrib PROPERTIES VERSION ${LIBRARY_VERSION} SOVERSION ${LIBRARY_SOVERSION}) +target_include_directories(hwy_contrib PUBLIC + $<BUILD_INTERFACE:${CMAKE_CURRENT_LIST_DIR}> + $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>) +target_compile_features(hwy_contrib PUBLIC cxx_std_11) +set_target_properties(hwy_contrib PROPERTIES + LINK_DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/hwy/hwy.version) +# not supported by MSVC/Clang, safe to skip (we use DLLEXPORT annotations) +if(UNIX AND NOT APPLE) + set_property(TARGET hwy_contrib APPEND_STRING PROPERTY + LINK_FLAGS " -Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/hwy/hwy.version") +endif() +endif() # HWY_ENABLE_CONTRIB + +add_library(hwy_test ${HWY_LIBRARY_TYPE} ${HWY_TEST_SOURCES}) +target_link_libraries(hwy_test hwy) +target_compile_options(hwy_test PRIVATE ${HWY_FLAGS}) +set_property(TARGET hwy_test PROPERTY POSITION_INDEPENDENT_CODE ON) +set_target_properties(hwy_test PROPERTIES VERSION ${LIBRARY_VERSION} SOVERSION ${LIBRARY_SOVERSION}) +target_include_directories(hwy_test PUBLIC + $<BUILD_INTERFACE:${CMAKE_CURRENT_LIST_DIR}> + $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>) +target_compile_features(hwy_test PUBLIC cxx_std_11) +set_target_properties(hwy_test PROPERTIES + LINK_DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/hwy/hwy.version) +# not supported by MSVC/Clang, safe to skip (we use DLLEXPORT annotations) +if(UNIX AND NOT APPLE) + set_property(TARGET hwy_test APPEND_STRING PROPERTY + LINK_FLAGS " -Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/hwy/hwy.version") +endif() + +# -------------------------------------------------------- hwy_list_targets +# Generate a tool to print the compiled-in targets as defined by the current +# flags. This tool will print to stderr at build time, after building hwy. +add_executable(hwy_list_targets hwy/tests/list_targets.cc) +target_compile_options(hwy_list_targets PRIVATE ${HWY_FLAGS}) +target_link_libraries(hwy_list_targets hwy) +target_include_directories(hwy_list_targets PRIVATE + $<BUILD_INTERFACE:${CMAKE_CURRENT_LIST_DIR}>) +# TARGET_FILE always returns the path to executable +# Naked target also not always could be run (due to the lack of '.\' prefix) +# Thus effective command to run should contain the full path +# and emulator prefix (if any). +if (NOT CMAKE_CROSSCOMPILING OR CMAKE_CROSSCOMPILING_EMULATOR) +add_custom_command(TARGET hwy_list_targets POST_BUILD + COMMAND ${CMAKE_CROSSCOMPILING_EMULATOR} $<TARGET_FILE:hwy_list_targets> || (exit 0)) +endif() + +# -------------------------------------------------------- +# Allow skipping the following sections for projects that do not need them: +# tests, examples, benchmarks and installation. + +# -------------------------------------------------------- install library +if (HWY_ENABLE_INSTALL) + +install(TARGETS hwy + LIBRARY DESTINATION "${CMAKE_INSTALL_LIBDIR}" + ARCHIVE DESTINATION "${CMAKE_INSTALL_LIBDIR}" + RUNTIME DESTINATION "${CMAKE_INSTALL_BINDIR}") +# Install all the headers keeping the relative path to the current directory +# when installing them.
+foreach (source ${HWY_SOURCES}) + if ("${source}" MATCHES "\.h$") + get_filename_component(dirname "${source}" DIRECTORY) + install(FILES "${source}" + DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/${dirname}") + endif() +endforeach() + +if (HWY_ENABLE_CONTRIB) +install(TARGETS hwy_contrib + LIBRARY DESTINATION "${CMAKE_INSTALL_LIBDIR}" + ARCHIVE DESTINATION "${CMAKE_INSTALL_LIBDIR}" + RUNTIME DESTINATION "${CMAKE_INSTALL_BINDIR}") +# Install all the headers keeping the relative path to the current directory +# when installing them. +foreach (source ${HWY_CONTRIB_SOURCES}) + if ("${source}" MATCHES "\.h$") + get_filename_component(dirname "${source}" DIRECTORY) + install(FILES "${source}" + DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/${dirname}") + endif() +endforeach() +endif() # HWY_ENABLE_CONTRIB + +install(TARGETS hwy_test + LIBRARY DESTINATION "${CMAKE_INSTALL_LIBDIR}" + ARCHIVE DESTINATION "${CMAKE_INSTALL_LIBDIR}" + RUNTIME DESTINATION "${CMAKE_INSTALL_BINDIR}") +# Install all the headers keeping the relative path to the current directory +# when installing them. +foreach (source ${HWY_TEST_SOURCES}) + if ("${source}" MATCHES "\.h$") + get_filename_component(dirname "${source}" DIRECTORY) + install(FILES "${source}" + DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/${dirname}") + endif() +endforeach() + +# Add a pkg-config file for libhwy and the contrib/test libraries. +set(HWY_LIBRARY_VERSION "${CMAKE_PROJECT_VERSION}") +set(HWY_PC_FILES libhwy.pc libhwy-test.pc) +if (HWY_ENABLE_CONTRIB) +list(APPEND HWY_PC_FILES libhwy-contrib.pc) +endif() # HWY_ENABLE_CONTRIB +foreach (pc ${HWY_PC_FILES}) + configure_file("${CMAKE_CURRENT_SOURCE_DIR}/${pc}.in" "${pc}" @ONLY) + install(FILES "${CMAKE_CURRENT_BINARY_DIR}/${pc}" + DESTINATION "${CMAKE_INSTALL_LIBDIR}/pkgconfig") +endforeach() + +endif() # HWY_ENABLE_INSTALL +# -------------------------------------------------------- Examples +if (HWY_ENABLE_EXAMPLES) + +# Avoids mismatch between GTest's static CRT and our dynamic. +set(gtest_force_shared_crt ON CACHE BOOL "" FORCE) + +# Programming exercise with integrated benchmark +add_executable(hwy_benchmark hwy/examples/benchmark.cc) +target_sources(hwy_benchmark PRIVATE + hwy/nanobenchmark.h) +# Try adding one of -DHWY_COMPILE_ONLY_SCALAR, -DHWY_COMPILE_ONLY_EMU128 or +# -DHWY_COMPILE_ONLY_STATIC to observe the difference in targets printed. +target_compile_options(hwy_benchmark PRIVATE ${HWY_FLAGS}) +target_link_libraries(hwy_benchmark hwy) +set_target_properties(hwy_benchmark + PROPERTIES RUNTIME_OUTPUT_DIRECTORY "examples/") + +endif() # HWY_ENABLE_EXAMPLES +# -------------------------------------------------------- Tests + +include(CTest) + +if(BUILD_TESTING AND HWY_ENABLE_TESTS) +enable_testing() +include(GoogleTest) + +set(HWY_SYSTEM_GTEST OFF CACHE BOOL "Use pre-installed googletest?") +if(HWY_SYSTEM_GTEST) +find_package(GTest REQUIRED) +else() +# Download and unpack googletest at configure time +configure_file(CMakeLists.txt.in googletest-download/CMakeLists.txt) +execute_process(COMMAND ${CMAKE_COMMAND} -G "${CMAKE_GENERATOR}" . + RESULT_VARIABLE result + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/googletest-download ) +if(result) + message(FATAL_ERROR "CMake step for googletest failed: ${result}") +endif() +execute_process(COMMAND ${CMAKE_COMMAND} --build . 
+ RESULT_VARIABLE result + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/googletest-download ) +if(result) + message(FATAL_ERROR "Build step for googletest failed: ${result}") +endif() + +# Prevent overriding the parent project's compiler/linker +# settings on Windows +set(gtest_force_shared_crt ON CACHE BOOL "" FORCE) + +# Add googletest directly to our build. This defines +# the gtest and gtest_main targets. +add_subdirectory(${CMAKE_CURRENT_BINARY_DIR}/googletest-src + ${CMAKE_CURRENT_BINARY_DIR}/googletest-build + EXCLUDE_FROM_ALL) +endif() # HWY_SYSTEM_GTEST + +set(HWY_TEST_FILES + hwy/contrib/algo/copy_test.cc + hwy/contrib/algo/find_test.cc + hwy/contrib/algo/transform_test.cc + hwy/aligned_allocator_test.cc + hwy/base_test.cc + hwy/highway_test.cc + hwy/nanobenchmark_test.cc + hwy/targets_test.cc + hwy/examples/skeleton_test.cc + hwy/tests/arithmetic_test.cc + hwy/tests/blockwise_test.cc + hwy/tests/blockwise_shift_test.cc + hwy/tests/combine_test.cc + hwy/tests/compare_test.cc + hwy/tests/compress_test.cc + hwy/tests/convert_test.cc + hwy/tests/crypto_test.cc + hwy/tests/demote_test.cc + hwy/tests/float_test.cc + hwy/tests/if_test.cc + hwy/tests/interleaved_test.cc + hwy/tests/logical_test.cc + hwy/tests/mask_test.cc + hwy/tests/mask_mem_test.cc + hwy/tests/memory_test.cc + hwy/tests/mul_test.cc + hwy/tests/reduction_test.cc + hwy/tests/reverse_test.cc + hwy/tests/shift_test.cc + hwy/tests/swizzle_test.cc + hwy/tests/test_util_test.cc +) + +set(HWY_TEST_LIBS hwy hwy_test) + +if (HWY_ENABLE_CONTRIB) +list(APPEND HWY_TEST_LIBS hwy_contrib) + +list(APPEND HWY_TEST_FILES + hwy/contrib/dot/dot_test.cc + hwy/contrib/image/image_test.cc + # Disabled due to SIGILL in clang7 debug build during gtest discovery phase, + # not reproducible locally. Still tested via bazel build. + # hwy/contrib/math/math_test.cc + hwy/contrib/sort/sort_test.cc +) +endif() # HWY_ENABLE_CONTRIB + +if(HWY_SYSTEM_GTEST) + if (CMAKE_VERSION VERSION_LESS 3.20) + set(HWY_GTEST_LIBS GTest::GTest GTest::Main) + else() + set(HWY_GTEST_LIBS GTest::gtest GTest::gtest_main) + endif() +else() + set(HWY_GTEST_LIBS gtest gtest_main) +endif() + +file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/tests) +foreach (TESTFILE IN LISTS HWY_TEST_FILES) + # The TESTNAME is the name without the extension or directory. + get_filename_component(TESTNAME ${TESTFILE} NAME_WE) + add_executable(${TESTNAME} ${TESTFILE}) + target_compile_options(${TESTNAME} PRIVATE ${HWY_FLAGS}) + # Test all targets, not just the best/baseline. This changes the default + # policy to all-attainable; note that setting -DHWY_COMPILE_* directly can + # cause compile errors because only one may be set, and other CMakeLists.txt + # that include us may set them. + target_compile_options(${TESTNAME} PRIVATE -DHWY_IS_TEST=1) + + target_link_libraries(${TESTNAME} ${HWY_TEST_LIBS} ${HWY_GTEST_LIBS}) + # Output test targets in the test directory. + set_target_properties(${TESTNAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "tests") + + if (HWY_EMSCRIPTEN) + set_target_properties(${TESTNAME} PROPERTIES LINK_FLAGS "-s SINGLE_FILE=1") + endif() + + if(${CMAKE_VERSION} VERSION_LESS "3.10.3") + gtest_discover_tests(${TESTNAME} TIMEOUT 60) + else () + gtest_discover_tests(${TESTNAME} DISCOVERY_TIMEOUT 60) + endif () +endforeach () + +# The skeleton test uses the skeleton library code. 
+target_sources(skeleton_test PRIVATE hwy/examples/skeleton.cc) + +endif() # BUILD_TESTING diff --git a/CMakeLists.txt.in b/CMakeLists.txt.in new file mode 100644 index 0000000..a0260b8 --- /dev/null +++ b/CMakeLists.txt.in @@ -0,0 +1,15 @@ +cmake_minimum_required(VERSION 2.8.12) + +project(googletest-download NONE) + +include(ExternalProject) +ExternalProject_Add(googletest + GIT_REPOSITORY https://github.com/google/googletest.git + GIT_TAG 43efa0a4efd40c78b9210d15373112081899a97c + SOURCE_DIR "${CMAKE_CURRENT_BINARY_DIR}/googletest-src" + BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}/googletest-build" + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + INSTALL_COMMAND "" + TEST_COMMAND "" +) diff --git a/CONTRIBUTING b/CONTRIBUTING new file mode 100644 index 0000000..8b7d4d2 --- /dev/null +++ b/CONTRIBUTING @@ -0,0 +1,33 @@ +# How to Contribute + +We'd love to accept your patches and contributions to this project. There are +just a few small guidelines you need to follow. + +## Contributor License Agreement + +Contributions to this project must be accompanied by a Contributor License +Agreement. You (or your employer) retain the copyright to your contribution; +this simply gives us permission to use and redistribute your contributions as +part of the project. Head over to <https://cla.developers.google.com/> to see +your current agreements on file or to sign a new one. + +You generally only need to submit a CLA once, so if you've already submitted one +(even if it was for a different project), you probably don't need to do it +again. + +## Code reviews + +All submissions, including submissions by project members, require review. We +use GitHub pull requests for this purpose. Consult +[GitHub Help](https://help.github.com/articles/about-pull-requests/) for more +information on using pull requests. + +## Testing + +This repository is used by JPEG XL, so major API changes will require +coordination. Please get in touch with us beforehand, e.g. by raising an issue. + +## Community Guidelines + +This project follows +[Google's Open Source Community Guidelines](https://opensource.google.com/conduct/). diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..f49a4e1 --- /dev/null +++ b/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files.
+ + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..969f329 --- /dev/null +++ b/README.md @@ -0,0 +1,322 @@ +# Efficient and performance-portable vector software + +[//]: # (placeholder, do not remove) + +Highway is a C++ library that provides portable SIMD/vector intrinsics. + +## Why + +We are passionate about high-performance software. We see major untapped +potential in CPUs (servers, mobile, desktops). Highway is for engineers who want +to reliably and economically push the boundaries of what is possible in +software. + +## How + +CPUs provide SIMD/vector instructions that apply the same operation to multiple +data items. This can reduce energy usage e.g. *fivefold* because fewer +instructions are executed. We also often see *5-10x* speedups. 
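As a minimal illustration of that idea (a sketch only, not part of the library: it assumes a single statically dispatched target and a `count` that is a multiple of the vector lane count; the function name `AddArrays` is ours):

```
#include <stddef.h>

#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;

// One Add op processes Lanes(d) elements per iteration. Assumes count is a
// multiple of Lanes(d), e.g. because the arrays use padded allocations.
HWY_ATTR void AddArrays(const float* HWY_RESTRICT a,
                        const float* HWY_RESTRICT b,
                        float* HWY_RESTRICT out, size_t count) {
  const hn::ScalableTag<float> d;  // tag describing a full vector of float
  for (size_t i = 0; i < count; i += hn::Lanes(d)) {
    hn::Store(hn::Add(hn::Load(d, a + i), hn::Load(d, b + i)), d, out + i);
  }
}
```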
+ +Highway makes SIMD/vector programming practical and workable according to these +guiding principles: + +**Does what you expect**: Highway is a C++ library with carefully-chosen +functions that map well to CPU instructions without extensive compiler +transformations. The resulting code is more predictable and robust to code +changes/compiler updates than autovectorization. + +**Works on widely-used platforms**: Highway supports four architectures; the +same application code can target eight instruction sets, including those with +'scalable' vectors (size unknown at compile time). Highway only requires C++11 +and supports four families of compilers. If you would like to use Highway on +other platforms, please raise an issue. + +**Flexible to deploy**: Applications using Highway can run on heterogeneous +clouds or client devices, choosing the best available instruction set at +runtime. Alternatively, developers may choose to target a single instruction set +without any runtime overhead. In both cases, the application code is the same +except for swapping `HWY_STATIC_DISPATCH` with `HWY_DYNAMIC_DISPATCH` plus one +line of code. + +**Suitable for a variety of domains**: Highway provides an extensive set of +operations, used for image processing (floating-point), compression, video +analysis, linear algebra, cryptography, sorting and random generation. We +recognise that new use-cases may require additional ops and are happy to add +them where it makes sense (e.g. no performance cliffs on some architectures). If +you would like to discuss, please file an issue. + +**Rewards data-parallel design**: Highway provides tools such as Gather, +MaskedLoad, and FixedTag to enable speedups for legacy data structures. However, +the biggest gains are unlocked by designing algorithms and data structures for +scalable vectors. Helpful techniques include batching, structure-of-array +layouts, and aligned/padded allocations. + +## Examples + +Online demos using Compiler Explorer: + +- [multiple targets with dynamic dispatch](https://gcc.godbolt.org/z/zP7MYe9Yf) + (recommended) +- [single target using -m flags](https://gcc.godbolt.org/z/rGnjMevKG) + +Projects using Highway: (to add yours, feel free to raise an issue or contact us +via the below email) + +* [iresearch database index](https://github.com/iresearch-toolkit/iresearch/blob/e7638e7a4b99136ca41f82be6edccf01351a7223/core/utils/simd_utils.hpp) +* [JPEG XL image codec](https://github.com/libjxl/libjxl) +* [Grok JPEG 2000 image codec](https://github.com/GrokImageCompression/grok) +* [vectorized Quicksort](https://github.com/google/highway/tree/master/hwy/contrib/sort) ([paper](https://arxiv.org/abs/2205.05982)) + +## Current status + +### Targets + +Supported targets: scalar, S-SSE3, SSE4, AVX2, AVX-512, AVX3_DL (~Icelake, +requires opt-in by defining `HWY_WANT_AVX3_DL`), NEON (ARMv7 and v8), SVE, SVE2, +WASM SIMD, RISC-V V. + +SVE was initially tested using farm_sve (see acknowledgments). + +### Versioning + +Highway releases aim to follow the semver.org system (MAJOR.MINOR.PATCH), +incrementing MINOR after backward-compatible additions and PATCH after +backward-compatible fixes. We recommend using releases (rather than the Git tip) +because they are tested more extensively, see below. + +The current version 1.0 signals an increased focus on backwards compatibility. +Applications using documented functionality will remain compatible with future +updates that have the same major version number. 
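As an aside on the "Rewards data-parallel design" principle above, here is a small sketch of an aligned, padded allocation (the rounding helper `AllocatePadded` is our own illustration; `AllocateAligned` is from `hwy/aligned_allocator.h`):

```
#include <stddef.h>

#include "hwy/aligned_allocator.h"
#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;

// Returns aligned storage for count floats, rounded up to a whole number of
// vectors so that full-vector loops need no remainder handling.
HWY_ATTR hwy::AlignedFreeUniquePtr<float[]> AllocatePadded(size_t count) {
  const size_t N = hn::Lanes(hn::ScalableTag<float>());
  const size_t padded = (count + N - 1) / N * N;  // round up to multiple of N
  return hwy::AllocateAligned<float>(padded);
}
```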
+
+### Testing
+
+Continuous integration tests build with a recent version of Clang (running on
+native x86, or QEMU for RVV and ARM) and MSVC 2019 (v19.28, running on native
+x86).
+
+Before releases, we also test on x86 with Clang and GCC, and ARMv7/8 via GCC
+cross-compile. See the [testing process](g3doc/release_testing_process.md) for
+details.
+
+### Related modules
+
+The `contrib` directory contains SIMD-related utilities: an image class with
+aligned rows, a math library (16 functions already implemented, mostly
+trigonometry), and functions for computing dot products and sorting.
+
+## Installation
+
+This project uses CMake to generate and build. In a Debian-based system you can
+install it via:
+
+```bash
+sudo apt install cmake
+```
+
+Highway's unit tests use [googletest](https://github.com/google/googletest).
+By default, Highway's CMake downloads this dependency at configuration time.
+You can disable this by setting the `HWY_SYSTEM_GTEST` CMake variable to ON and
+installing gtest separately:
+
+```bash
+sudo apt install libgtest-dev
+```
+
+To build Highway as a shared or static library (depending on BUILD_SHARED_LIBS),
+the standard CMake workflow can be used:
+
+```bash
+mkdir -p build && cd build
+cmake ..
+make -j && make test
+```
+
+Or you can run `run_tests.sh` (`run_tests.bat` on Windows).
+
+Bazel is also supported for building, but it is not as widely used/tested.
+
+## Quick start
+
+You can use the `benchmark` inside examples/ as a starting point.
+
+A [quick-reference page](g3doc/quick_reference.md) briefly lists all operations
+and their parameters, and the [instruction_matrix](g3doc/instruction_matrix.pdf)
+indicates the number of instructions per operation.
+
+The [FAQ](g3doc/faq.md) answers questions about portability, API design and
+where to find more information.
+
+We recommend using full SIMD vectors whenever possible for maximum performance
+portability. To obtain them, pass a `ScalableTag<float>` (or equivalently
+`HWY_FULL(float)`) tag to functions such as `Zero/Set/Load`. There are two
+alternatives for use-cases requiring an upper bound on the lanes:
+
+- For up to `N` lanes, specify `CappedTag<T, N>` or the equivalent
+  `HWY_CAPPED(T, N)`. The actual number of lanes will be `N` rounded down to
+  the nearest power of two, such as 4 if `N` is 5, or 8 if `N` is 8. This is
+  useful for data structures such as a narrow matrix. A loop is still required
+  because vectors may actually have fewer than `N` lanes.
+
+- For exactly a power of two `N` lanes, specify `FixedTag<T, N>`. The largest
+  supported `N` depends on the target, but is guaranteed to be at least
+  `16/sizeof(T)`.
+
+Due to ADL restrictions, user code calling Highway ops must either:
+* Reside inside `namespace hwy { namespace HWY_NAMESPACE {`; or
+* prefix each op with an alias such as `namespace hn = hwy::HWY_NAMESPACE;
+  hn::Add()`; or
+* add using-declarations for each op used: `using hwy::HWY_NAMESPACE::Add;`.
+
+Additionally, each function that calls Highway ops (such as `Load`) must either
+be prefixed with `HWY_ATTR`, OR reside between `HWY_BEFORE_NAMESPACE()` and
+`HWY_AFTER_NAMESPACE()`. Lambda functions currently require `HWY_ATTR` before
+their opening brace.
+
+The entry points into code using Highway differ slightly depending on whether
+they use static or dynamic dispatch.
+
+* For static dispatch, `HWY_TARGET` will be the best available target among
+  `HWY_BASELINE_TARGETS`, i.e. those allowed for use by the compiler (see
+  [quick-reference](g3doc/quick_reference.md)). Functions inside
+  `HWY_NAMESPACE` can be called using `HWY_STATIC_DISPATCH(func)(args)` within
+  the same module they are defined in. You can call the function from other
+  modules by wrapping it in a regular function and declaring the regular
+  function in a header.
+
+* For dynamic dispatch, a table of function pointers is generated via the
+  `HWY_EXPORT` macro that is used by `HWY_DYNAMIC_DISPATCH(func)(args)` to
+  call the best function pointer for the current CPU's supported targets. A
+  module is automatically compiled for each target in `HWY_TARGETS` (see
+  [quick-reference](g3doc/quick_reference.md)) if `HWY_TARGET_INCLUDE` is
+  defined and `foreach_target.h` is included.
+
+When using dynamic dispatch, `foreach_target.h` is included from translation
+units (.cc files), not headers. Headers containing vector code shared between
+several translation units require a special include guard, for example the
+following taken from `examples/skeleton-inl.h`:
+
+```
+#if defined(HIGHWAY_HWY_EXAMPLES_SKELETON_INL_H_) == defined(HWY_TARGET_TOGGLE)
+#ifdef HIGHWAY_HWY_EXAMPLES_SKELETON_INL_H_
+#undef HIGHWAY_HWY_EXAMPLES_SKELETON_INL_H_
+#else
+#define HIGHWAY_HWY_EXAMPLES_SKELETON_INL_H_
+#endif
+
+#include "hwy/highway.h"
+// Your vector code
+#endif
+```
+
+By convention, we name such headers `-inl.h` because their contents (often
+function templates) are usually inlined.
+
+## Compiler flags
+
+Applications should be compiled with optimizations enabled; without inlining,
+SIMD code may slow down by factors of 10 to 100. For clang and GCC, `-O2` is
+generally sufficient.
+
+For MSVC, we recommend compiling with `/Gv` to allow non-inlined functions to
+pass vector arguments in registers. If intending to use the AVX2 target together
+with half-width vectors (e.g. for `PromoteTo`), it is also important to compile
+with `/arch:AVX2`. This seems to be the only way to generate VEX-encoded SSE4
+instructions on MSVC. Otherwise, mixing VEX-encoded AVX2 instructions and
+non-VEX SSE4 may cause severe performance degradation. Unfortunately, the
+resulting binary will then require AVX2. Note that no such flag is needed for
+clang and GCC because they support target-specific attributes, which we use to
+ensure proper VEX code generation for AVX2 targets.
+
+## Strip-mining loops
+
+To vectorize a loop, "strip-mining" transforms it into an outer loop and inner
+loop with number of iterations matching the preferred vector width.
+
+In this section, let `T` denote the element type, `d = ScalableTag<T>`, `count`
+the number of elements to process, and `N = Lanes(d)` the number of lanes in a
+full vector. Assume the loop body is given as a function
+`template <bool kPartial> void LoopBody(D d, size_t index, size_t max_n)`.
+
+Highway offers several ways to express loops where `N` need not divide `count`:
+
+* Ensure all inputs/outputs are padded. Then the loop is simply
+
+  ```
+  for (size_t i = 0; i < count; i += N) LoopBody<false>(d, i, 0);
+  ```
+  Here, the template parameter and third function argument are not needed.
+
+  This is the preferred option, unless `N` is in the thousands and vector
+  operations are pipelined with long latencies. This was the case for
+  supercomputers in the 90s, but nowadays ALUs are cheap and we see most
+  implementations split vectors into 1, 2 or 4 parts, so there is little cost
+  to processing entire vectors even if we do not need all their lanes. Indeed
+  this avoids the (potentially large) cost of predication or partial
+  loads/stores on older targets, and does not duplicate code.
+
+* Use the `Transform*` functions in hwy/contrib/algo/transform-inl.h. This
+  takes care of the loop and remainder handling and you simply define a
+  generic lambda function (C++14) or functor which receives the current vector
+  from the input/output array, plus optionally vectors from up to two extra
+  input arrays, and returns the value to write to the input/output array.
+
+  Here is an example implementing the BLAS function SAXPY (`alpha * x + y`):
+
+  ```
+  Transform1(d, x, n, y, [](auto d, const auto v, const auto v1) HWY_ATTR {
+    return MulAdd(Set(d, alpha), v, v1);
+  });
+  ```
+
+* Process whole vectors as above, followed by a scalar loop:
+
+  ```
+  size_t i = 0;
+  for (; i + N <= count; i += N) LoopBody<false>(d, i, 0);
+  for (; i < count; ++i) LoopBody<false>(CappedTag<T, 1>(), i, 0);
+  ```
+  The template parameter and third function argument are again not needed.
+
+  This avoids duplicating code, and is reasonable if `count` is large.
+  If `count` is small, the second loop may be slower than the next option.
+
+* Process whole vectors as above, followed by a single call to a modified
+  `LoopBody` with masking:
+
+  ```
+  size_t i = 0;
+  for (; i + N <= count; i += N) {
+    LoopBody<false>(d, i, 0);
+  }
+  if (i < count) {
+    LoopBody<true>(d, i, count - i);
+  }
+  ```
+  Now the template parameter and third function argument can be used inside
+  `LoopBody` to non-atomically 'blend' the first `num_remaining` lanes of `v`
+  with the previous contents of memory at subsequent locations:
+  `BlendedStore(v, FirstN(d, num_remaining), d, pointer);`. Similarly,
+  `MaskedLoad(FirstN(d, num_remaining), d, pointer)` loads the first
+  `num_remaining` elements and returns zero in other lanes.
+
+  This is a good default when it is infeasible to ensure vectors are padded,
+  but is only safe `#if !HWY_MEM_OPS_MIGHT_FAULT`!
+  In contrast to the scalar loop, only a single final iteration is needed.
+  The increased code size from two loop bodies is expected to be worthwhile
+  because it avoids the cost of masking in all but the final iteration.
+
+## Additional resources
+
+* [Highway introduction (slides)](g3doc/highway_intro.pdf)
+* [Overview of instructions per operation on different architectures](g3doc/instruction_matrix.pdf)
+* [Design philosophy and comparison](g3doc/design_philosophy.md)
+* [Implementation details](g3doc/impl_details.md)
+
+## Acknowledgments
+
+We have used [farm-sve](https://gitlab.inria.fr/bramas/farm-sve) by Berenger
+Bramas; it has proved useful for checking the SVE port on an x86 development
+machine.
+
+This is not an officially supported Google product.
+Contact: janwas@google.com diff --git a/WORKSPACE b/WORKSPACE new file mode 100644 index 0000000..6df1f62 --- /dev/null +++ b/WORKSPACE @@ -0,0 +1,24 @@ +workspace(name = "highway") + +load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive") + +http_archive( + name = "com_google_googletest", + urls = ["https://github.com/google/googletest/archive/609281088cfefc76f9d0ce82e1ff6c30cc3591e5.zip"], + sha256 = "5cf189eb6847b4f8fc603a3ffff3b0771c08eec7dd4bd961bfd45477dd13eb73", + strip_prefix = "googletest-609281088cfefc76f9d0ce82e1ff6c30cc3591e5", +) + +# See https://google.github.io/googletest/quickstart-bazel.html +http_archive( + name = "rules_cc", + urls = ["https://github.com/bazelbuild/rules_cc/archive/40548a2974f1aea06215272d9c2b47a14a24e556.zip"], + sha256 = "56ac9633c13d74cb71e0546f103ce1c58810e4a76aa8325da593ca4277908d72", + strip_prefix = "rules_cc-40548a2974f1aea06215272d9c2b47a14a24e556", +) + +# Need recent version for config_setting_group +http_archive( + name = "bazel_skylib", + urls = ["https://github.com/bazelbuild/bazel-skylib/releases/download/0.9.0/bazel_skylib-0.9.0.tar.gz"], +) diff --git a/cmake/FindAtomics.cmake b/cmake/FindAtomics.cmake new file mode 100644 index 0000000..e866b73 --- /dev/null +++ b/cmake/FindAtomics.cmake @@ -0,0 +1,56 @@ +# Original issue: +# * https://gitlab.kitware.com/cmake/cmake/-/issues/23021#note_1098733 +# +# For reference: +# * https://gcc.gnu.org/wiki/Atomic/GCCMM +# +# riscv64 specific: +# * https://lists.debian.org/debian-riscv/2022/01/msg00009.html +# +# ATOMICS_FOUND - system has c++ atomics +# ATOMICS_LIBRARIES - libraries needed to use c++ atomics + +include(CheckCXXSourceCompiles) + +# RISC-V only has 32-bit and 64-bit atomic instructions. GCC is supposed +# to convert smaller atomics to those larger ones via masking and +# shifting like LLVM, but it’s a known bug that it does not. This means +# anything that wants to use atomics on 1-byte or 2-byte types needs +# -latomic, but not 4-byte or 8-byte (though it does no harm). 
+set(atomic_code
+    "
+     #include <atomic>
+     #include <cstdint>
+     std::atomic<std::uint8_t> n8 (0); // riscv64
+     std::atomic<std::uint64_t> n64 (0); // armel, mipsel, powerpc
+     int main() {
+       ++n8;
+       ++n64;
+       return 0;
+     }")
+
+# https://gitlab.kitware.com/cmake/cmake/-/issues/24063
+set(CMAKE_CXX_STANDARD 11)
+check_cxx_source_compiles("${atomic_code}" ATOMICS_LOCK_FREE_INSTRUCTIONS)
+
+if(ATOMICS_LOCK_FREE_INSTRUCTIONS)
+  set(ATOMICS_FOUND TRUE)
+  set(ATOMICS_LIBRARIES)
+else()
+  set(CMAKE_REQUIRED_LIBRARIES "-latomic")
+  check_cxx_source_compiles("${atomic_code}" ATOMICS_IN_LIBRARY)
+  set(CMAKE_REQUIRED_LIBRARIES)
+  if(ATOMICS_IN_LIBRARY)
+    set(ATOMICS_LIBRARY atomic)
+    include(FindPackageHandleStandardArgs)
+    find_package_handle_standard_args(Atomics DEFAULT_MSG ATOMICS_LIBRARY)
+    set(ATOMICS_LIBRARIES ${ATOMICS_LIBRARY})
+    unset(ATOMICS_LIBRARY)
+  else()
+    if(Atomics_FIND_REQUIRED)
+      message(FATAL_ERROR "Neither lock free instructions nor -latomic found.")
+    endif()
+  endif()
+endif()
+unset(atomic_code)
+unset(CMAKE_CXX_STANDARD)
diff --git a/debian/changelog b/debian/changelog
new file mode 100644
index 0000000..36d0c1d
--- /dev/null
+++ b/debian/changelog
@@ -0,0 +1,157 @@
+highway (1.0.2-1) UNRELEASED; urgency=medium
+
+* Add ExclusiveNeither, FindKnownFirstTrue, Ne128
+* Add 16-bit SumOfLanes/ReorderWidenMulAccumulate/ReorderDemote2To
+* Faster sort for low-entropy input, improved pivot selection
+* Add GN build system, Highway FAQ, k32v32 type to vqsort
+* CMake: Support find_package(GTest), add rvv-inl.h, add HWY_ENABLE_TESTS
+* Fix MIPS and C++20 build, Apple LLVM 10.3 detection, EMU128 AllTrue on RVV
+* Fix missing exec_prefix, RVV build, warnings, libatomic linking
+* Work around GCC 10.4 issue, disabled RDCYCLE, arm7 with vfpv3
+* Documentation/example improvements
+* Support static dispatch to SVE2_128 and SVE_256
+
+ -- Jan Wassenberg Thu, 27 Oct 2022 17:00:00 +0200
+
+highway (1.0.1-1) UNRELEASED; urgency=medium
+
+* Add Eq128, i64 Mul, unsigned->float ConvertTo
+* Faster sort for few unique keys, more robust pivot selection
+* Fix: floating-point generator for sort tests, Min/MaxOfLanes for i16
+* Fix: avoid always_inline in debug, link atomic
+* GCC warnings: string.h, maybe-uninitialized, ignored-attributes
+* GCC warnings: preprocessor int overflow, spurious use-after-free/overflow
+* Doc: <=HWY_AVX3, Full32/64/128, how to use generic-inl
+
+ -- Jan Wassenberg Tue, 23 Aug 2022 10:00:00 +0200
+
+highway (1.0.0-1) UNRELEASED; urgency=medium
+
+* ABI change: 64-bit target values, more room for expansion
+* Add CompressBlocksNot, CompressNot, Lt128Upper, Min/Max128Upper, TruncateTo
+* Add HWY_SVE2_128 target
+* Sort speedups especially for 128-bit
+* Documentation clarifications
+* Faster NEON CountTrue/FindFirstTrue/AllFalse/AllTrue
+* Improved SVE codegen
+* Fix u16x8 ConcatEven/Odd, SSSE3 i64 Lt
+* MSVC 2017 workarounds
+* Support for runtime dispatch on Arm/GCC/Linux
+
+ -- Jan Wassenberg Wed, 27 Jul 2022 10:00:00 +0200
+
+highway (0.17.0-1) UNRELEASED; urgency=medium
+
+* Add ExtractLane, InsertLane, IsInf, IsFinite, IsNaN
+* Add StoreInterleaved2, LoadInterleaved2/3/4, BlendedStore, SafeFillN
+* Add MulFixedPoint15, Or3
+* Add Copy[If], Find[If], Generate, Replace[If] algos
+* Add HWY_EMU128 target (replaces HWY_SCALAR)
+* HWY_RVV is feature-complete
+* Add HWY_ENABLE_CONTRIB build flag, HWY_NATIVE_FMA, HWY_WANT_SSSE3/SSE4 macros
+* Extend ConcatOdd/Even and StoreInterleaved* to all types
+* Allow CappedTag
+* Sort speedups: 2x for AVX2, 1.09x for AVX3; avoid x86 malloc
+* Expand documentation
+* Fix RDTSCP crash
in nanobenchmark +* Fix XCR0 check (was ignoring AVX3 on ICL) +* Support Arm/RISC-V timers + + -- Jan Wassenberg Fri, 20 May 2022 10:00:00 +0200 + +highway (0.16.0-1) UNRELEASED; urgency=medium + + * Add contrib/sort (vectorized quicksort) + * Add IfNegativeThenElse, IfVecThenElse + * Add Reverse2,4,8, ReverseBlocks, DupEven/Odd, AESLastRound + * Add OrAnd, Min128, Max128, Lt128, SumsOf8 + * Support capped/partial vectors on RVV/SVE, int64 in WASM + * Support SVE2, shared library build + * Remove deprecated overloads without the required d arg (UpperHalf etc.) + + -- Jan Wassenberg Thu, 03 Feb 2022 11:00:00 +0100 + +highway (0.15.0-1) UNRELEASED; urgency=medium + + * New ops: CompressBlendedStore, ConcatOdd/Even, IndicesFromVec + * New ops: OddEvenBlocks, SwapAdjacentBlocks, Reverse, RotateRight + * Add bf16, unsigned comparisons, more lane types for Reverse/TableLookupLanes + * Contrib: add sort(ing network) and dot(product) + * Targets: update RVV for LLVM, add experimental WASM2 + * Separate library hwy_test for test utils + * Add non-macro Simd<> aliases + * Fixes: const V& for GCC, AVX3 BZHI, POPCNT with AVX on MSVC, avoid %zu + + -- Jan Wassenberg Wed, 10 Nov 2021 10:00:00 +0100 + +highway (0.14.2-1) UNRELEASED; urgency=medium + + * Add MaskedLoad + * Fix non-glibc PPC, Windows GCC, MSVC 19.14 + * Opt-in for -Werror; separate design_philosophy.md + + -- Jan Wassenberg Tue, 24 Aug 2021 15:00:00 +0200 + +highway (0.14.1-1) UNRELEASED; urgency=medium + + * Add LoadMaskBits, CompressBits[Store] + * Fix CPU feature check (AES/F16C) and warnings + * Improved DASSERT - disabled in optimized builds + + -- Jan Wassenberg Tue, 17 Aug 2021 14:00:00 +0200 + +highway (0.14.0-1) UNRELEASED; urgency=medium + + * Add SVE, S-SSE3, AVX3_DL targets + * Support partial vectors in all ops + * Add PopulationCount, FindFirstTrue, Ne, TableLookupBytesOr0 + * Add AESRound, CLMul, MulOdd, HWY_CAP_FLOAT16 + + -- Jan Wassenberg Thu, 29 Jul 2021 15:00:00 +0200 + +highway (0.12.2-1) UNRELEASED; urgency=medium + + * fix scalar-only test and Windows macro conflict with Load/StoreFence + * replace deprecated wasm intrinsics + + -- Jan Wassenberg Mon, 31 May 2021 16:00:00 +0200 + +highway (0.12.1-1) UNRELEASED; urgency=medium + + * doc updates, ARM GCC support, fix s390/ppc, complete partial vectors + * fix warnings, faster ARM div/sqrt, separate hwy_contrib library + * add Abs(i64)/FirstN/Pause, enable AVX2 on MSVC + + -- Jan Wassenberg Wed, 19 May 2021 15:00:00 +0200 + +highway (0.12.0-1) UNRELEASED; urgency=medium + + * Add Shift*8, Compress16, emulated Scatter/Gather, StoreInterleaved3/4 + * Remove deprecated HWY_*_LANES, deprecate HWY_GATHER_LANES + * Proper IEEE rounding, reduce libstdc++ usage, inlined math + + -- Jan Wassenberg Thu, 15 Apr 2021 20:00:00 +0200 + +highway (0.11.1-1) UNRELEASED; urgency=medium + + * Fix clang7 asan error, finish f16 conversions and add test + + -- Jan Wassenberg Thu, 25 Feb 2021 16:00:00 +0200 + +highway (0.11.0-1) UNRELEASED; urgency=medium + + * Add RVV+mask logical ops, allow Shl/ShiftLeftSame on all targets, more math + + -- Jan Wassenberg Thu, 18 Feb 2021 20:00:00 +0200 + +highway (0.7.0-1) UNRELEASED; urgency=medium + + * Added API stability notice, Compress[Store], contrib/, SignBit, CopySign + + -- Jan Wassenberg Tue, 5 Jan 2021 17:00:00 +0200 + +highway (0.1-1) UNRELEASED; urgency=medium + + * Initial debian package. 
+ + -- Alex Deymo Mon, 19 Oct 2020 16:48:07 +0200 diff --git a/debian/compat b/debian/compat new file mode 100644 index 0000000..f599e28 --- /dev/null +++ b/debian/compat @@ -0,0 +1 @@ +10 diff --git a/debian/control b/debian/control new file mode 100644 index 0000000..7c60ebc --- /dev/null +++ b/debian/control @@ -0,0 +1,23 @@ +Source: highway +Maintainer: JPEG XL Maintainers +Section: misc +Priority: optional +Standards-Version: 3.9.8 +Build-Depends: cmake, + debhelper (>= 9), + libgtest-dev +Homepage: https://github.com/google/highway + +Package: libhwy-dev +Architecture: any +Section: libdevel +Depends: ${misc:Depends} +Description: Efficient and performance-portable SIMD wrapper (developer files) + This library provides type-safe and source-code portable wrappers over + existing platform-specific intrinsics. Its design aims for simplicity, + reliable efficiency across platforms, and immediate usability with current + compilers. + . + This package installs the development files. There's no runtime library + since most of Highway is implemented in headers and only a very small + static library is needed. diff --git a/debian/copyright b/debian/copyright new file mode 100644 index 0000000..53ea57a --- /dev/null +++ b/debian/copyright @@ -0,0 +1,20 @@ +Format: https://www.debian.org/doc/packaging-manuals/copyright-format/1.0/ +Upstream-Name: highway + +Files: * +Copyright: 2020 Google LLC +License: Apache-2.0 + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + . + http://www.apache.org/licenses/LICENSE-2.0 + . + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + . + On Debian systems, the complete text of the Apache License, Version 2 + can be found in "/usr/share/common-licenses/Apache-2.0". diff --git a/debian/rules b/debian/rules new file mode 100755 index 0000000..969fc12 --- /dev/null +++ b/debian/rules @@ -0,0 +1,6 @@ +#!/usr/bin/make -f +%: + dh $@ --buildsystem=cmake + +override_dh_auto_configure: + dh_auto_configure -- -DHWY_SYSTEM_GTEST=ON diff --git a/debian/source/format b/debian/source/format new file mode 100644 index 0000000..163aaf8 --- /dev/null +++ b/debian/source/format @@ -0,0 +1 @@ +3.0 (quilt) diff --git a/g3doc/design_philosophy.md b/g3doc/design_philosophy.md new file mode 100644 index 0000000..10fff8e --- /dev/null +++ b/g3doc/design_philosophy.md @@ -0,0 +1,186 @@ +# Design philosophy + +* Performance is important but not the sole consideration. Anyone who goes to + the trouble of using SIMD clearly cares about speed. However, portability, + maintainability and readability also matter, otherwise we would write in + assembly. We aim for performance within 10-20% of a hand-written assembly + implementation on the development platform. There is no performance gap vs. + intrinsics: Highway code can do anything they can. If necessary, you can use + platform-specific instructions inside `#if HWY_TARGET == HWY_NEON` etc. + +* The guiding principles of C++ are "pay only for what you use" and "leave no + room for a lower-level language below C++". We apply these by defining a + SIMD API that ensures operation costs are visible, predictable and minimal. 
+ +* Performance portability is important, i.e. the API should be efficient on + all target platforms. Unfortunately, common idioms for one platform can be + inefficient on others. For example: summing lanes horizontally versus + shuffling. Documenting which operations are expensive does not prevent their + use, as evidenced by widespread use of `HADDPS`. Performance acceptance + tests may detect large regressions, but do not help choose the approach + during initial development. Analysis tools can warn about some potential + inefficiencies, but likely not all. We instead provide [a carefully chosen + set of vector types and operations that are efficient on all target + platforms](instruction_matrix.pdf) (PPC8, SSE4/AVX2+, ARMv8). + +* Future SIMD hardware features are difficult to predict. For example, AVX2 + came with surprising semantics (almost no interaction between 128-bit + blocks) and AVX-512 added two kinds of predicates (writemask and zeromask). + To ensure the API reflects hardware realities, we suggest a flexible + approach that adds new operations as they become commonly available, with + fallback implementations where necessary. + +* Masking/predication differs between platforms, and it is not clear how + important the use cases are beyond the ternary operator `IfThenElse`. + AVX-512/ARM SVE zeromasks are useful, but not supported by P0214R5. We + provide `IfThen[Zero]Else[Zero]` variants. + +* "Width-agnostic" SIMD is more future-proof than user-specified fixed sizes. + For example, valarray-like code can iterate over a 1D array with a + library-specified vector width. This will result in better code when vector + sizes increase, and matches the direction taken by + [ARM SVE](https://alastairreid.github.io/papers/sve-ieee-micro-2017.pdf) and + RiscV V as well as Agner Fog's + [ForwardCom instruction set proposal](https://goo.gl/CFizWu). However, some + applications may require fixed sizes, so we also guarantee support for <= + 128-bit vectors in each instruction set. + +* The API and its implementation should be usable and efficient with commonly + used compilers, including MSVC. For example, we write `ShiftLeft<3>(v)` + instead of `v << 3` because MSVC 2017 (ARM64) does not propagate the literal + (https://godbolt.org/g/rKx5Ga). Highway requires function-specific target + attributes, supported by GCC 4.9 / Clang 3.9 / MSVC 2015. + +* Efficient and safe runtime dispatch is important. Modules such as image or + video codecs are typically embedded into larger applications such as + browsers, so they cannot require separate binaries for each CPU. Libraries + also cannot predict whether the application already uses AVX2 (and pays the + frequency throttling cost), so this decision must be left to the + application. Using only the lowest-common denominator instructions + sacrifices too much performance. Therefore, we provide code paths for + multiple instruction sets and choose the most suitable at runtime. To reduce + overhead, dispatch should be hoisted to higher layers instead of checking + inside every low-level function. Highway supports inlining functions in the + same file or in `*-inl.h` headers. We generate all code paths from the same + source to reduce implementation- and debugging cost. + +* Not every CPU need be supported. For example, pre-SSSE3 CPUs are + increasingly rare and the AVX instruction set is limited to floating-point + operations. 
To reduce code size and compile time, we provide specializations + for S-SSE3, SSE4, AVX2 and AVX-512 instruction sets on x86, plus a scalar + fallback. + +* Access to platform-specific intrinsics is necessary for acceptance in + performance-critical projects. We provide conversions to and from intrinsics + to allow utilizing specialized platform-specific functionality, and simplify + incremental porting of existing code. + +* The core API should be compact and easy to learn; we provide a [concise + reference](quick_reference.md). + +## Prior API designs + +The author has been writing SIMD code since 2002: first via assembly language, +then intrinsics, later Intel's `F32vec4` wrapper, followed by three generations +of custom vector classes. The first used macros to generate the classes, which +reduces duplication but also readability. The second used templates instead. +The third (used in highwayhash and PIK) added support for AVX2 and runtime +dispatch. The current design (used in JPEG XL) enables code generation for +multiple platforms and/or instruction sets from the same source, and improves +runtime dispatch. + +## Overloaded function API + +Most C++ vector APIs rely on class templates. However, the ARM SVE vector type +is sizeless and cannot be wrapped in a class. We instead rely on overloaded +functions. Overloading based on vector types is also undesirable because SVE +vectors cannot be default-constructed. We instead use a dedicated tag type +`Simd` for overloading, abbreviated to `D` for template arguments and `d` in +lvalues. + +Note that generic function templates are possible (see generic_ops-inl.h). + +## Masks + +AVX-512 introduced a major change to the SIMD interface: special mask registers +(one bit per lane) that serve as predicates. It would be expensive to force +AVX-512 implementations to conform to the prior model of full vectors with lanes +set to all one or all zero bits. We instead provide a Mask type that emulates +a subset of this functionality on other platforms at zero cost. + +Masks are returned by comparisons and `TestBit`; they serve as the input to +`IfThen*`. We provide conversions between masks and vector lanes. For clarity +and safety, we use FF..FF as the definition of true. To also benefit from +x86 instructions that only require the sign bit of floating-point inputs to be +set, we provide a special `ZeroIfNegative` function. + +## Differences vs. [P0214R5](https://goo.gl/zKW4SA) / std::experimental::simd + +1. Allowing the use of built-in vector types by relying on non-member + functions. By contrast, P0214R5 requires a wrapper class, which does not + work for sizeless vector types currently used by ARM SVE and Risc-V. + +1. Adding widely used and portable operations such as `AndNot`, `AverageRound`, + bit-shift by immediates and `IfThenElse`. + +1. Designing the API to avoid or minimize overhead on AVX2/AVX-512 caused by + crossing 128-bit 'block' boundaries. + +1. Avoiding the need for non-native vectors. By contrast, P0214R5's `simd_cast` + returns `fixed_size<>` vectors which are more expensive to access because + they reside on the stack. We can avoid this plus additional overhead on + ARM/AVX2 by defining width-expanding operations as functions of a vector + part, e.g. promoting half a vector of `uint8_t` lanes to one full vector of + `uint16_t`, or demoting full vectors to half vectors with half-width lanes. + +1. Guaranteeing access to the underlying intrinsic vector type. This ensures + all platform-specific capabilities can be used. 
P0214R5 instead only + 'encourages' implementations to provide access. + +1. Enabling safe runtime dispatch and inlining in the same binary. P0214R5 is + based on the Vc library, which does not provide assistance for linking + multiple instruction sets into the same binary. The Vc documentation + suggests compiling separate executables for each instruction set or using + GCC's ifunc (indirect functions). The latter is compiler-specific and risks + crashes due to ODR violations when compiling the same function with + different compiler flags. We solve this problem via target-specific + namespaces and attributes (see HOWTO section below). We also permit a mix of + static target selection and runtime dispatch for hotspots that may benefit + from newer instruction sets if available. + +1. Omitting inefficient or non-performance-portable operations such as `hmax`, + `operator[]`, and unsupported integer comparisons. Applications can often + replace these operations at lower cost than emulating that exact behavior. + +1. Omitting `long double` types: these are not commonly available in hardware. + +1. Ensuring signed integer overflow has well-defined semantics (wraparound). + +1. Simple header-only implementation and a fraction of the size of the + Vc library from which P0214 was derived (39K, vs. 92K lines in + https://github.com/VcDevel/Vc according to the gloc Chrome extension). + +1. Avoiding hidden performance costs. P0214R5 allows implicit conversions from + integer to float, which costs 3-4 cycles on x86. We make these conversions + explicit to ensure their cost is visible. + +## Other related work + +* [Neat SIMD](http://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=7568423) + adopts a similar approach with interchangeable vector/scalar types and + a compact interface. It allows access to the underlying intrinsics, but + does not appear to be designed for other platforms than x86. + +* UME::SIMD ([code](https://goo.gl/yPeVZx), [paper](https://goo.gl/2xpZrk)) + also adopts an explicit vectorization model with vector classes. + However, it exposes the union of all platform capabilities, which makes the + API harder to learn (209-page spec) and implement (the estimated LOC count + is [500K](https://goo.gl/1THFRi)). The API is less performance-portable + because it allows applications to use operations that are inefficient on + other platforms. + +* Inastemp ([code](https://goo.gl/hg3USM), [paper](https://goo.gl/YcTU7S)) + is a vector library for scientific computing with some innovative features: + automatic FLOPS counting, and "if/else branches" using lambda functions. + It supports IBM Power8, but only provides float and double types and does + not support SVE without assuming the runtime vector size. diff --git a/g3doc/faq.md b/g3doc/faq.md new file mode 100644 index 0000000..507c7c3 --- /dev/null +++ b/g3doc/faq.md @@ -0,0 +1,310 @@ +# Frequently Asked Questions + +[[TOC]] + +## Documentation + +Q1.1: How do I **find the Highway op name** corresponding to an existing +intrinsic? A: Search for the intrinsic in (for example) +[x86_128-inl.h](https://github.com/google/highway/blob/master/hwy/ops/x86_128-inl.h). +The Highway op is typically the name of the function that calls the intrinsic. +See also the +[quick reference](https://github.com/google/highway/blob/master/g3doc/quick_reference.md) +which lists all of the Highway ops. + +Q1.2: Are there **examples of porting intrinsics to Highway**? A: See +cl/448957386 and cl/450480902. 
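+
+As a rough illustration (our own sketch, not taken from those CLs), a port
+typically replaces each intrinsic with the op of the same meaning, and the
+fixed-width `__m128` type with a tag-derived vector type. The `Squared*`
+functions below are hypothetical, and the "before" half is of course x86-only:
+
+```
+#include <emmintrin.h>  // SSE intrinsics for the "before" version
+
+#include "hwy/highway.h"
+
+namespace hn = hwy::HWY_NAMESPACE;
+
+// Before: exactly 4 floats, x86 only.
+__m128 SquaredSse(__m128 v) { return _mm_mul_ps(v, v); }
+
+// After: Lanes(d) floats on whichever target is compiled. On x86, Mul of
+// 128-bit float vectors calls the same _mm_mul_ps (see x86_128-inl.h).
+HWY_ATTR hn::Vec<hn::ScalableTag<float>> Squared(
+    hn::Vec<hn::ScalableTag<float>> v) {
+  return hn::Mul(v, v);  // no `d` needed: the vector type suffices
+}
+```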
+ +Q1.3: Where do I find documentation for each **platform's intrinsics**? A: See +[Intel](https://www.intel.com/content/www/us/en/docs/intrinsics-guide), +[Arm NEON and SVE](https://developer.arm.com/architectures/instruction-sets/intrinsics), +[RISC-V V](https://github.com/riscv/riscv-v-spec/blob/master/v-spec.adoc), +[WebAssembly](https://nemequ.github.io/waspr/intrinsics). + +Q1.4: Where do I find **instruction latency/throughput**? A: For x86, a +combination of [uops.info](https://www.uops.info/table.html) and +https://agner.org/optimize/, plus Intel's above intrinsics guide and +[AMD's sheet (zip file)](https://www.amd.com/system/files/TechDocs/56665.zip). +For Arm, the +[Software_Optimization_Guide](https://developer.arm.com/documentation/pjdoc466751330-9685/latest/) +for Neoverse V1 etc. For RISC-V, the vendor's tables (typically not publicly +available). + +Q1.5: Where can I find **inspiration for SIMD-friendly algorithms**? A: + +- [Algorithms for Modern Hardware online book](https://en.algorithmica.org/hpc/) +- [SIMD for C++ developers](http://const.me/articles/simd/simd.pdf) +- [Bit twiddling collection](https://graphics.stanford.edu/~seander/bithacks.html) +- [SIMD-within-a-register](http://aggregate.org/MAGIC/) +- Hacker's Delight book, which has a huge collection of bitwise identities, + but is written for hypothetical RISC CPUs, which differ in some ways from + the SIMD capabilities of current CPUs. + +Q1.6: How do I **predict performance**? A: The best approach by far is +end-to-end application benchmarking. Typical microbenchmarks are subject to +numerous pitfalls including unrealistic cache and branch predictor hit rates +(unless the benchmark randomizes its behavior). But sometimes we would like a +quick indication of whether a short piece of code runs efficiently on a given +CPU. Intel's IACA used to serve this purpose but has been discontinued. We now +recommend llvm-mca, +[integrated into Compiler Explorer](https://gcc.godbolt.org/z/n-KcQ-). This +shows the predicted throughput and the pressure on the various functional units, +but does not cover dynamic behavior including frontend and cache. For a bit more +detail, see +[https://en.algorithmica.org/hpc/profiling/mca/](https://en.algorithmica.org/hpc/profiling/mca/). +chriselrod mentioned the recently published [uica](https://uica.uops.info/), +which is reportedly more accurate +([paper](https://arxiv.org/pdf/2107.14210.pdf)). + +## Correctness + +Q2.1: **Which targets are covered** by my tests? A: Tests execute for every +target supported by the current CPU. The CPU may vary across runs in a cloud +environment, so you may want to specify constraints to ensure the CPU is as +recent as possible. + +Q2.2: Why do **floating-point results differ** on some platforms? A: It is +commonly believed that floating-point reproducibility across platforms is +infeasible. That is somewhat pessimistic, but not entirely wrong. Although +IEEE-754 guarantees certain properties, including the rounding of each +operation, commonly used compiler flags can invalidate them. In particular, +clang/GCC -ffp-contract and MSVC /fp:contract can change results of anything +involving multiply followed by add. This is usually helpful (fusing both +operations into a single FMA, with only a single rounding), but depending on the +computation typically changes the end results by around 10^-5. Using Highway's +`MulAdd` op can have the same effect: SSE4, NEON and WASM may not support FMA, +but most other platforms do. 
+A common workaround is to use a tolerance when comparing expected values. For
+robustness across both large and small values, we recommend both a relative and
+absolute (L1 norm) tolerance. The -ffast-math flag can have more subtle and
+dangerous effects. It allows reordering operations (which can also change
+results), but also removes guarantees about NaN, thus we do not recommend using
+it.
+
+Q2.3: How do I make my code **safe for asan and msan**? A: The main challenge is
+dealing with the remainders in arrays not divisible by the vector length. Using
+`LoadU`, or even `MaskedLoad` with the mask set to `FirstN(d, remaining_lanes)`,
+may trigger page faults or asan errors. We instead recommend using
+`hwy/contrib/algo/transform-inl.h`. Rather than having to write a loop plus
+remainder handling, you simply define a templated (lambda) function implementing
+one loop iteration. The `Generate` or `Transform*` functions then take care of
+remainder handling.
+
+## API design
+
+Q3.1: Are the **`d` arguments optimized out**? A: Yes, `d` is an lvalue of the
+zero-sized type `Simd<>`, typically obtained via `ScalableTag<T>`. These only
+serve to select overloaded functions and do not occupy any storage at runtime.
+
+Q3.2: Why do **only some functions have a `d` argument**? A: Ops which receive
+and return vectors typically do not require a `d` argument because the type
+information on vectors (either built-in or wrappers) is sufficient for C++
+overload resolution. The `d` argument is required for:
+
+```
+- Influencing the number of lanes loaded/stored from/to memory. The
+  arguments to `Simd<>` include an upper bound `N`, and a shift count
+  `kPow2` to divide the actual number of lanes by a power of two.
+- Indicating the desired vector or mask type to return from 'factory'
+  functions such as `Set` or `FirstN`, `BitCast`, or conversions such as
+  `PromoteTo`.
+- Disambiguating the argument type to ops such as `VecFromMask` or
+  `AllTrue`, because masks may be generic types shared between multiple
+  lane types.
+- Determining the actual number of lanes for certain ops, in particular
+  those defined in terms of the upper half of a vector (`UpperHalf`, but
+  also `Combine` or `ConcatUpperLower`) and reductions such as
+  `MaxOfLanes`.
+```
+
+Q3.3: What's the policy for **adding new ops**? A: Please reach out, we are
+happy to discuss via Github issue. The general guideline is that there should be
+concrete plans to use the op, and it should be efficiently implementable on all
+platforms without major performance cliffs. In particular, each implementation
+should be at least as efficient as what is achievable on any platform using
+portable code without the op.
+
+Q3.4: `auto` is discouraged, **what vector type** should we use? A: You can use
+`Vec<D>` or `Mask<D>`, where `D` is the type of `d` (in fact we often use
+`decltype(d)` for that). To keep code short, you can define typedefs/aliases,
+for example `using V = Vec<D>`. Note that the Highway implementation
+uses `VFromD<D>`, which is equivalent but currently necessary because `Vec<D>`
+is defined after the Highway implementations in hwy/ops/*.
+
+Q3.5: **Why is base.h separate** from highway.h? A: It can be useful for files
+that just want compiler-dependent macros, for example `HWY_RESTRICT` in public
+headers. This avoids the expense of including the full `highway.h`, which can be
+large because some platform headers declare thousands of intrinsics.
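+
+To make Q3.4 concrete, here is a short sketch of our own showing the alias
+pattern:
+
+```
+#include "hwy/highway.h"
+
+namespace hn = hwy::HWY_NAMESPACE;
+
+// Sums the first Lanes(d) elements; `aligned` must be vector-aligned.
+HWY_ATTR float SumOfFirstVector(const float* HWY_RESTRICT aligned) {
+  const hn::ScalableTag<float> d;
+  using D = decltype(d);  // tag type, usable as the `D` in Vec<D>
+  using V = hn::Vec<D>;   // vector type; valid even for sizeless vectors
+  const V v = hn::Load(d, aligned);
+  const V sums = hn::SumOfLanes(d, v);  // sum broadcast to all lanes
+  return hn::GetLane(sums);
+}
+```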
+
+## Portability
+
+Q4.1: How do I **only generate code for a single instruction set** (static
+dispatch)? A: Suppose we know that all target CPUs support a given baseline (for
+example SSE4). Then we can reduce binary size and compilation time by only
+generating code for its instruction set. This is actually the default for
+Highway code that does not use foreach_target.h. Highway detects via predefined
+macros which instruction sets the compiler is allowed to use, which is governed
+by compiler flags. This [example](https://gcc.godbolt.org/z/rGnjMevKG) documents
+which flags are required on x86.
+
+Q4.2: Why does my working x86 code **not compile on SVE or RISC-V**? A: Assuming
+the code uses only documented identifiers (not, for example, the AVX2-specific
+`Vec256`), the problem is likely due to compiler limitations related to sizeless
+vectors. Code that works on x86 or NEON but not other platforms is likely
+breaking one of the following rules:
+
+- Use functions (Eq, Lt) instead of overloaded operators (`==`, `<`);
+- Prefix Highway ops with `hwy::HWY_NAMESPACE`, or an alias (`hn::Load`), or
+  ensure your code resides inside `namespace hwy::HWY_NAMESPACE`;
+- Avoid arrays of vectors and static/thread_local/member vectors; instead use
+  arrays of the lane type (T);
+- Avoid pointer arithmetic on vectors; instead increment pointers to lanes by
+  the vector length (`Lanes(d)`).
+
+Q4.3: Why are **class members not allowed**? A: This is a limitation of clang
+and GCC, which disallow sizeless types (including SVE and RISC-V vectors) as
+members. This is because it is not known at compile time how large the vectors
+are. MSVC does not yet support SVE or RISC-V V, so the issue has not yet come
+up there.
+
+Q4.4: Why are **overloaded operators not allowed**? A: C++ disallows overloading
+functions for built-in types, and vectors on some platforms (SVE, RISC-V) are
+indeed built-in types precisely due to the above limitation. Discussions are
+ongoing whether the compiler could add builtin `operator<(unspecified_vector,
+unspecified_vector)`. When (or if) that becomes widely supported, this
+limitation can be lifted.
+
+Q4.5: Can I declare **arrays of lanes on the stack**? A: This mostly works, but
+is not necessarily safe or portable. On RISC-V, vectors can be quite large (64
+KiB for LMUL=8), which can exceed the stack size. It is better to use
+`hwy::AllocateAligned<T>(Lanes(d))`.
+
+## Boilerplate
+
+Q5.1: What is **boilerplate**? A: We use this to refer to reusable
+infrastructure which mostly serves to support runtime dispatch. We strongly
+recommend starting a SIMD project by copying from an existing one, because the
+ordering of code matters and the vector-specific boilerplate may be unfamiliar.
+For static dispatch, see cl/408632990. For dynamic dispatch, see
+hwy/examples/skeleton.cc or cl/376150733.
+
+Q5.2: What's the difference between **`HWY_BEFORE_NAMESPACE` and `HWY_ATTR`**?
+A: Both are ways of enabling SIMD code generation in clang/gcc. The former is a
+pragma that applies to all subsequent namespace-scope and member functions, but
+not lambda functions. It can be more convenient than specifying `HWY_ATTR` for
+every function. However, `HWY_ATTR` is still necessary for lambda functions that
+use SIMD.
+
+Q5.3: **Why use `HWY_NAMESPACE`**? A: This is only required when using
+foreach_target.h to generate code for multiple targets and dispatch to the best
+one at runtime. The namespace name changes for each target to avoid ODR
+violations.
This would not be necessary for binaries built for a single target +instruction set. However, we recommend placing your code in a `HWY_NAMESPACE` +namespace (nested under your project's namespace) regardless so that it will be +ready for runtime dispatch if you want that later. + +Q5.4: What are these **unusual include guards**? A: Suppose you want to share +vector code between several translation units, and ensure it is inlined. With +normal code we would use a header. However, foreach_target.h wants to re-compile +(via repeated preprocessor `#include`) a translation unit once per target. A +conventional include guard would strip out the header contents after the first +target. By convention, we use header files named *-inl.h with a special include +guard of the form: + +``` +#if defined(MYPROJECT_FILE_INL_H_TARGET) == defined(HWY_TARGET_TOGGLE) +#ifdef MYPROJECT_FILE_INL_H_TARGET +#undef MYPROJECT_FILE_INL_H_TARGET +#else +#define MYPROJECT_FILE_INL_H_TARGET +#endif +``` + +Highway takes care of defining and un-defining `HWY_TARGET_TOGGLE` after each +recompilation such that the guarded header is included exactly once per target. +Again, this effort is only necessary when using foreach_target.h. However, we +recommend using the special include guards already so your code is ready for +runtime dispatch. + +Q5.5: How do I **prevent lint warnings for the include guard**? A: The linter +wishes to see a normal include guard at the start of the file. We can simply +insert an empty guard, followed by our per-target guard. + +``` +// Start of file: empty include guard to avoid lint errors +#ifndef MYPROJECT_FILE_INL_H_ +#define MYPROJECT_FILE_INL_H_ +#endif +// Followed by the actual per-target guard as above +``` + +## Efficiency + +Q6.1: I heard that modern CPUs support unaligned loads efficiently. Why does +Highway **differentiate unaligned and aligned loads/stores**? A: It is true that +Intel CPUs since Haswell have greatly reduced the penalty for unaligned loads. +Indeed the `LDDQU` instruction intended to reduce their performance penalty is +no longer necessary because normal loads (`MOVDQU`) now behave in the same way, +splitting unaligned loads into two aligned loads. However, this comes at a cost: +using two (both) load ports per cycle. This can slow down +low-arithmetic-intensity algorithms such as dot products that mainly load +without performing much arithmetic. Also, unaligned stores are typically more +expensive on any platform. Thus we recommend using aligned stores where +possible, and testing your code on x86 (which may raise faults if your pointers +are actually unaligned). Note that the more specialized memory operations apart +from Load/Store (e.g. `CompressStore` or `BlendedStore`) are not specialized for +aligned pointers; this is to avoid doubling the number of memory ops. + +Q6.2: **When does `Prefetch` help**? A: Prefetching reduces apparent memory +latency by starting the process of loading from cache or DRAM before the data is +actually required. In some cases, this can be a 10-20% improvement if the +application is indeed latency sensitive. However, the CPU may already be +triggering prefetches by analyzing your access patterns. Depending on the +platform, one or two separate instances of continuous forward or backward scans +are usually automatically detected. If so, then additional prefetches may +actually degrade performance. Also, applications will not see much benefit if +they are bottlenecked by something else such as vector execution resources. 
+Finally, a prefetch only helps if it comes sufficiently before the subsequent +load, but not so far ahead that it again falls out of the cache. Thus prefetches +are typically applied to future loop iterations. Unfortunately, the prefetch +distance (gap between current position and where we want to prefetch) is highly +platform- and microarchitecture dependent, so it can be difficult to choose a +value appropriate for all platforms. + +Q6.3: Is **CPU clock throttling** really an issue? A: Early Intel +implementations of AVX2 and especially AVX-512 reduced their clock frequency +once certain instructions are executed. A +[microbenchmark](https://lemire.me/blog/2018/08/15/the-dangers-of-avx-512-throttling-a-3-impact/) +specifically designed to reveal the worst case (with only few AVX-512 +instructions) shows a 3-4% slowdown on Skylake. Note that this is for a single +core; the effect depends on the number of cores using SIMD, and the CPU type +(Bronze/Silver are more heavily affected than Gold/Platinum). However, the +throttling is defined relative to an arbitrary base frequency; what actually +matters is the measured performance. Because throttling or SIMD usage can affect +the entire system, it is important to measure end-to-end application performance +rather than rely on microbenchmarks. In practice, we find the speedup from +sustained SIMD usage (not just sporadic instructions amid mostly scalar code) is +much larger than the impact of throttling. For JPEG XL image decompression and +vectorized Quicksort, we observe a 1.4-1.6x end to end speedup from AVX-512 vs +AVX2, even on multiple cores of a Xeon Gold. Note that throttling is +[no longer a concern on recent Intel](https://travisdowns.github.io/blog/2020/08/19/icl-avx512-freq.html#summary) +implementations of AVX-512 (Icelake and Rocket Lake client), nor have AMD CPUs +required throttling for AVX2. + +Q6.4: Why does my CPU sometimes only execute **one vector instruction per +cycle** even though the specs say it could do 2-4? A: CPUs and fast food +restaurants assume there will be a mix of instructions/food types. If everyone +orders only french fries, that unit will be the bottleneck. Instructions such as +permutes/swizzles and comparisons are assumed to be less common, and thus can +typically only execute one per cycle. Check the platform's optimization guide +for the per-instruction "throughput". For example, Intel Skylake executes +swizzles on port 5, and thus only one per cycle. Similarly, Arm V1 can only +execute one predicate-setting instruction (including comparisons) per cycle. As +a workaround, consider replacing equality comparisons with the OR-sum of XOR +differences. + +Q6.5: How **expensive are Gather/Scatter**? A: Platforms that support it +typically process one lane per cycle. This can be far slower than normal +Load/Store (which can typically handle two or even three entire *vectors* per +cycle), so avoid them where possible. However, some algorithms such as rANS +entropy coding and hash tables require gathers, and it is still usually better +to use them than to avoid vectorization entirely. 
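+
+To illustrate with a sketch of our own: `GatherIndex(d, base, indices)` loads
+`base[indices[i]]` into lane `i`, where `indices` holds signed integers of the
+same width as the lane type.
+
+```
+#include <stdint.h>
+
+#include "hwy/highway.h"
+
+namespace hn = hwy::HWY_NAMESPACE;
+
+// Writes out[i] = table[idx[i]] for one vector's worth of lanes (i < Lanes(d)).
+HWY_ATTR void LookupFloats(const float* HWY_RESTRICT table,
+                           const int32_t* HWY_RESTRICT idx,
+                           float* HWY_RESTRICT out) {
+  const hn::ScalableTag<float> d;
+  const hn::RebindToSigned<decltype(d)> di;  // int32 tag with the same lanes
+  hn::StoreU(hn::GatherIndex(d, table, hn::LoadU(di, idx)), d, out);
+}
+```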
diff --git a/g3doc/highway_intro.pdf b/g3doc/highway_intro.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..e051a2c63199e0680ebb28a3fb6f8b1088b13db4
GIT binary patch
literal 1313237
[compressed binary patch data for g3doc/highway_intro.pdf omitted]
z=bl}@)iC_($x19JtWn0UrPR$XWZTYU9?=5)1&l(iG?;h`BP1exs*MS`PmU5cM2bK^ zE$&7NTzg06d$f8k$&$FQXhy|!9Sw4wx*9d*@}m<;Xi+vS+Lrw)Rb@szzBrbOl^jWr zF^fhtZRuAs$ciDQs?~iLTes{w6fxXLyuR@@WY{?09SjD&mYpO0zQ<%rZACJpYE@e7 zu|4Kg2You?E#Xh1G3=kTizg@=KS7Z^Vt^;oyz9i>ygHk&O1=By+$YzSwxZ#QmJKmK zinIqYFO7FpF=*AH(hz->#+C|FEv{GJcEBjs9U-mCV3~9`wR4$}Ej2Yn7u!ajPVk^q z%TQqFGE~V5#^{Tr-t`bm`4M%&PAxFVq7^xm8GpS3i^a1(Bd;L-Yz3y-%N!ZlZi^)t%<0&NwEP7oYlnv9ec@~20w-&2bI69XqDAbzSCN;2Wgp4bC z67i!hF({6E?bq|Jka-1f)jVh7^w}Q9+xn)e52Zi+8ySRs_QBO>d?e5MM#-rSEcMOa z{srN=Na1$vn8daVoleE&yj`KATjpD`5w-)Yq#j^>F6>|>J;s+AfLSnYD0)&8mwscs z_hz@lp(M~I^vJG308W+5VOCU6VCQVBZtB444k<(u?uHz8r9t>%-~Q%YotLh_ZeC`M zU|359Jbi<Fr}?*!p2-I~HKy@_-mJ;|)cDoaY;Ski$oM!XuxIqvNQx zjCp7y$+8+@zN+S=6TWCF+ntfYDqFYjVqNsW?Ak-$2kVLSTI#b5kytP+n-zGAQ=w`* zp3z?82t(LJ8#6{GLu}K~)-e{y%?(zzLNV%8b?KAdUM|~oX^uKAkWyfvV%JP6I-QIT z*=ETH%v(;z3+;2kb~@_ztqs?((Z*6%tVf1V3WX%2CWwS2tK>uB$9IB6^H?%- z6p(bni|xzbs=L>>Yq?{iPkwaN5AJ$_Km2jYUi0mUjf^MAqgX=Ox2ym2yGwkA%X@^> zd!Mih+bj{_uZGKgYM#r?lYGxr6;B%o6J|!sC_PV=R<8_!bq5V0hZ_zI@8lY$8$y_O zPaer14%@ymvR)J{wZChDseMWjoYdG<`m$qw4V88s$+M&=x@B|z+`b?@!9`?f+82XV z7e`DRi6KcF+znS%(05RKlg@7jG z!1T}h`OUiVw{Q9H#!bJS@Skus0Moxq=tj*I8$2~UepHy{AR=RGk`+@G5obVp*5i(t zqUscfQC>K*T!mDe+{!?q+PPRM;>!^K!}ElTo5fxsyVm3}6VKk0+l^26kvqei?{MI3 z<05NRtp`8hIz{L7*%rB1meWu1{OrqpMrG8QH<)zGedXInctHuowe|Q> z{$n;CjNg;ZOi|}#?3;KFCOkQJaJ&x1aJ?GMNW+=%*epd4j*;dDNqqArIOHnU!2dFL z(DP;858nlr0o9`oC_g_+c<%jJZqZ`nJeYZuOF2R@X|`LD5||0@flok8EhsNqPk%NV zeMQn`qxJXhJ-w}zCq~KW=Nfs$Sbw*t%Mz)^*Q-dipO0s&V@4OzfUU%$Ao|j6m*aoB zzSZ6P@71LXK&ghY_wQ9Dt=om=Ow5mJ8F>z9O4BT=2NY%@(=_JC&v;ua_ooZC`Lp*T zB|3M#?#xnRrj)3r;nAzB4cPfqk@CM+$f!$y8tC5AIaa^UZL1%rPHrl9FJC(}xlw3r zhi^~VUa@=O`{*;?otC65POK_DWLTqWWs|AkN8G^CawfM~7{Dbnoo`5kyBcW8jX6T) zs#vdiBPtWd>?D0@lj_ z2m$)MC~lnE&*-<%0>BOZ<3{Swjb-%k?W}=aj9mayATVE_|X$DMhuj`BH_jPfTZ$Y`C0xOP= znEFXiyr**y51nP$jwBwBKwlVD7ub!n%vRzeNS;B&2YEpDB`HGoYnA;hL7K|5N*UfXdS3Oi{MjW!2Fsk?e(X)E6_MN&XoJ8S>FT|%8e&67TwNM8~b89mpd zIc`t@9B`U1p?iJk&ua+174LrWHoTd;Y=W7KCEG&wLc@%Y?${5%iia z3%lh>?sbo^syZ|0DVEM}CAt&az5`=(x5XV+g@>+TP!G5Pu91TUSI)g{%@6jgmq~hK z(vMd2okRw>@djzQaev7M{U-=rcx!Nb=7mFALot;y((@TC?3wCy#HR4vrX@b!QibGp z$ORmsnRBgqx~avfC0T)W1yC1ujY@+2Vb@@`CC;|hG^38xzM{uMzf9jA{(%MakhGBc z=CAEy&SZ+N*38Tz)!dmQTCIqkPUG_)FV<%d##n)4z_1SSR*fdfv0Tdc#?2xs`4`XE z_ObF{7^ff~cyMxXTqJcXimb?jRLV98$@H-zMIx63+ZC;B?S-|nma;l-Ii)+r0f6Z@N&JVQC*Xg}3Ha@V{|w%Lll*^O{cC*lpdo!T zNO%vL&lnCXOce==yTrNS)=emZMAQpEG6VC8x351!eS!fRnhph%?52lM@TvA$$Y);P zdytRaNF)=A#lcClh?#7X z7fF_5T&9yLu&Z>WS0G)2@i+2-PVp_8vKIKEI=B5%o@VHub&zmo6($tq{m-BJCA1p- zuOnv{g5VG}DDlAnxmmI!!YwKE+;vDQj#>-rE29f9M2e^y$m86LP&w}|E*1IGyp0)AKLqd)Lq>hI0f$?@2s63-)inq)J- z-af(l4Xm3h4BZD7>|R=+-XwJ{b+GKZwQ6Sj@ru{(r2Dy5-&=p7baOpR^|_&bwN#g} zf@i`-=`+bZlrZ7<7PRDOxu;l;&^!xQBhhQ$=Mep%UYNkFRKvw6+FZ>0PCA@nCn7Q{ zDvgV<69IumE+5DsX+kBGmQ?n|c1Mr)(7#AiMIM^DUV?1YGEvaR(;ldE z61Ws`sCOK^cQSsxh?_s$aVTGPWCg35>X9*GGY#DROks`@ym#VS$UE<^%>7+eFgT<> z<$~T3x@)YM_DtglX4*f$MB9;@^{cXt988Jp`0)u+Z7szVM(30X-ddhXA4n_gm|nUL zl--z-cm$D2pR||Q8xh+-fVFV-wLq;xze)&`G+~zb+Vz<1b@Dj1ty%&!u5v)ct5gB0 z;jC#GWbB)kN;Y!QsCBaxwAqy3L|#xrzK}-`0d0Z?z;?QlJf8vD=I}92VF}$kL~n6| zbJ?~Mx;R$zqamNmiCPm~*CP=OYlN<+f%#h>atRMRTDPmPyF%WqYY@F)`BaP}JFB^^ zM%#Dmd5R6Sn+xXlC1M&b-UniyN()l%G50n$&Kn|v(Wzi6fKQQkRL7=Xb}W=SOTq-g zp0Xd~AjfZht=dcr_NEpx-b(K1!<(vAQB>iKMOr4kVFUAbSU7bcDz^OA55B0Gen|dV z5x49IXkCqm?_z15OeDLRRI6ISR2j=stCfr$Tk?J@xi+VA9L1Hg`2Jp$#34v@v(DAp znD-u@?BshtNN%A{u6kFgSj)AfYn}731B?dR%<0#4dY56%({lJnSc%H+_G&Ha#;UVe z#gq}(_)=Ue3<~n}D~y4T`E_t;sHQfE8AfC3af~ncY7QDxnAbI4C(aq`+~g~Z4g6bH zdreGeTYLiuTdqe^V;+c?uYmR1Z;o>gg)1PMG^M+PD8uy 
zTbaB#uRwy|%(MOud;cKOfS+jhdjs@0;QLqX1^nd0f56%QhP{B_dGuef7w{Xy{Rj5` zY`DHzlmD00`qv46{ILJORj#+j@2`XYioKuT9@+jhr`hSrW2A~CAq>A{N-@UeT{#;< z{(U+HiGqC@PH&SKw;@UNE&`44;`I(ci%Xf0ucqO|+ zWV!7*x=mYLXt}#9v$>qCrtK4OvMC~V?n`^;kW0AV*vJR)G! z?`2Q~_zmj`QPgX-6R%;~u60RzM6A2tczO8m)ixOo8dTbq(MN(oEU4m^nb$ZGe=eW^ zLD;4nE^E^3(A`99PW8t-ZISY>H*TKbp4KSCt0YL;8qtZTU(g1$r4d(l)(0wAtyQ%v z^|iG6e+nSRs5yT+!knBWw-+(`!D^s}p)bs_+85jV!Tu|qU$4LhpNqxK8h;Bo|EI8* z9N*BF^fkfP(=8E99q;d_Cgi)-9&L{la*v;j#C=Exjv$RwkWn3WXG}OjW>LLo{0+Bz~AtOTk(ZFKQ_~i zQ>`Q-OTfM(5(Psv1wIc7V*rnvR*M9-hXgPM9U-l9LlP2lGfs*#BGi0XSY%;z?+P{y za(#?fWv449$_zH6rP^$L0LE_tstcS3Z)W;!)8JJb+~9ljqWOZrq$aoDk=>3~$+-uz*HL$w$(Fz2L>+b|s{(oJw*8A!N#)Di_m&2bzwJCK#G6yCP$WC#Ht#*iF4$yKq>@uan9Htr%F zx>ziikoO9!iKSY)Rpl^#wRL0%<7NdO!_@^W;T?+X<6_jYpdyr; zGDieNCTzZ(vPIOS>JHLw*|1b4B-gA5V{aKN0$9jFG6#*rMN#ir;6u&08*R6=7=>ru zGU{|4`CF46G%64^f&dx@5F-Q>;upO5+$QEw+m-uNMuTkICnyS=QBKAX(Z|Knyfs;l zR$W~e8fYkzf0&}lUu(LKpT{2(G{`HZ9b$R5Yz&Ycv!5=&C6|X z=V!VKBk{Q^A}0Kc;Ppf@)gG=D9A0g-Hw@G!0T%qMu(HH%Idgw`gNR8)P9DbCLU&+|{OGhFp2sn<&khr8qdn>Lj1yuiidKyB-K32&V zeKJ#0-|4WsnVv|Iyh`l9N2SCLf62w;8kJ|M*(}#FamLh$nsNfUhGoC*8a)my>!LS5 zyuk2bR~RIvhF=arv1zVy`!wcXJ@Cy6RTvbxVMU0=!#{A8oj(G9 z;`(qXL@tafP*U{R8!qBnW zqwS{0xzw))fuH<8yI_Se&b0_S95Q?xI7B{B+ZmAHN-G}0(s^$D5;o#n{zZdnzF7>R z{_gcAJTFb!*W~)#bCfONd_nb|0}b&r>k+I`SbMxo=-Ecb`vU6l+7G9Jjc2(C?(VnR ziyw)<3RrOkxld^4xwQ?nu35W5xVvxI1>jgVDcI~@AK_Zl)5a z%0eJ$K?^vmUzlc%a8sy;BhpQmBtV)(_fD3SH#nm8VeSPbGT#zUry{r=k-hp(#0^8S zR>GN`!tG-99m7)4THr2X7qfEjC^CCcOqB=vQ>hz<+2&HFn0ADGfqNe}$sM~jjYCcE zh4MasWB2C(G>QNnNiJinYqH6&k2ESU#oIlGV4?|+8ocAr{%I(O3dor&WDI4< z>Ms&G?JuBp3Rr}HLjZo2zn_H?!11=v{wD$WW8C?dJyZbT|4w}UN`d;PC=$RQaZbN) zasz&kRr;?9z)yMipAmq+5no#4F=!k}ZWb4y5Xg!c)R9O6z$X)Q?VT^~JRJFC$|%$* z74b-;5Wxv>R5(QK??qy`^0vV_cU>=lcp)hf@Kl=GSM1+%+YnY*s@BTNoAhopK8_JQ zBF$zQJ|24sAwho=5Eu+g+YVMNfRPHOjXzWz_Tw9TTSN`7;75cZRfeVg@-BbrHTE3; z2`mfmd#BFjH$pz5yG_Ds#`$m4{qEM!$T@afE9Yc#iK~_rA@2fFGN@?LgK@=^UBz9A zVdEh`W{B};qqLUEFgodRl}xmH_dv7EAthj@VPsk7$k=B?3!#s}B5Un^nvWBfbbyB! 
z9UT7AhPV3FZTgJ{WU~;y>Q(X5ozMtIcRCOXc-&R=8SuI4=|o1F3JWoz3EHJi*nyRj z4MH(3G>Zz33FWM||F-P+PFubTZ7Qkk^1AH%=Lt%{6Vy zdsc)4g(W&kn^^F->TvW0epEpWp@Ri(mz`nHhzaTxE@nfB?t@SghGQ>!f@d!{dB;Is z6NP=iWC%jFFYfPC@`*06Pov{r0 z*bh)fE{}FM2?U;5h)7C~Va%Ovpv@;c3YVAHqwf zO9Ve-Bc1c*Zc!V46D!GvyiY3Gy0l-IPj3SaPR>05R6`eU5P{o}S5pt0+>jopaRpJp z+ad+I5@Vbz+eo)aw(PdmsZmnx!c-UK9P7CSv-K>JFi(uIms1}8e zZ;(w0%`=%YH+!XRpr76c2Nt5hCREH=%a&^r2$9ES*-;S}l@d^e!dA}AGy>-kAGMU# zB%acVO2s#L4TQ$ELz$Zfh%siD!!b@<+!HEMU1*5hsati51ja+p=zU*3ZOIuyoqVX2 z{hnIfs!hTldV5)ztF0}I`Q%3>)8NkFbrk!BXTpiVQy^Wlo}#2BgY+10(|PTQ!|)Yn zH!vN=HC|N5FcUOx4$gqob?L&?mp6wl;Ar2)xcf5D(D|N&#%jpYVIi1B>&h*^dd8Eo zm^{&x6i?fDqu6h8_wU%BFfd>RzZbF=`{CfpTq{~xjz+ljJ0vFSAQ@dYW(ng5X};l#x#{rQwyu3df@A#|{rxcv3}PrL2zmh7FzUA)=cxP`sB9u>_cOk|*&s zU=lU1^iu5rly%mU(E}yka=!rLB;Z82})*R%xImW%zRB@x3#frNd4p}BJg0m!3K6W z9G=hyes;bB&46J`=~Z)kaw$Mf*kJm1wf#?V&+%4W|ETT13hU3x_}AM0XY$xT-k1HC zxfKBBe;4;}#$CU}{GYY`e-rn=PWX>%`>*h{e-2A(RQvT%YdvXYHO?d4mNNw4 zDlP!kvZ!2`NKFtmPfi9#V;wFcb%TrZx+gw;&3-|SR$HxiVQ|Y~Z*t3l-Ntb+a6*d_ z4A<wU?v>Cd>a+-N*iOCy_tZW+!1ROq_U`>mU?i{ZZS zHmPIhS@fsdzR*LD=i0k%j;C{;{<7~r3@_x30GpS_KOg}zS&8i4mpVj;2gan3YX`4( z9tAyD%dWKs<-L@vl5?r0gZJ};%jhnReEwok8H4c=g_rN-XWm%d*g^&vgXhq?XpL3a z1{B)}vYia;3Yu|j%-hRqx>0`^N2QE#N!!}J?WhQ)gLm_$D#<<-<)X6drZIv^1uQw_ zRubDNu-t)mz}$p&h2N!|*OyZFi)nC{OY&~T%WOJ~3TWds^AiIdS)-JM^;t{$vFcWv zq`5eUD)V8`5(qY>Ncl5Z(gXzf?lJOg;&$(x`Yk*I$x_Y8EFWU*`TKOz)W02_Ub|fQ zU6iJPx?UJnWvLwWZW~>fv}t62xGD}M@KJllsT7@cRlfFi>JD|E>Ik+NuSU5gfZwBQ zS}YQ%%Db@_}Ko!mH*7W`=|T#f5(-X|9w!w{4awFmY-Aa|GruF>xBR8 zKK(ma?$pwW#brZt^ZKd`+bDKIBDm673%hnqgZhmz_RY%~vQRpnCRcaVtU3Q{N(xCY z5AQMKM(ht0kL{(LA0E6T6lHkK(z|!2$0i>yvp?`n0;-_g7O9W|C0kw{LqD1*BmeA_ zLA{EuhlZ`5z&VpQl5{;b*!_H~HGpOTfBT3f(4!zU?9BUO_%OC50rv6baBs%PAggO; zKlaB6au=coJ8M~`xw57LRbGTs67ec?c@|T7G;6-KnRUwWRkWzAk5|sVIOlcWGq`YC zQA*sXp*81xN5U#JY9Ng|cD!Q=^&$$X=|&SM*%#6CHZWe!i-k9etoOuLx zIsPd4iXg-&S5xz2o8{?8;oJarDta>0o>sj8-uID7`6UoJ#=4IBs)UK!iBnmec_TR; zjK(BfjPaaL`d<$*cs@#2O_&643VnLzoC67 z{QhUClio7|Zo%6rh6jxOcE@j^FHonauYhGpYp^SRW67RpAU?c&yM4hageNt&6e?zC zgdga<%XRBdI)FG(A~zUJVXGv|G>impBflTLvDkp*WZ})_FcTfeZ#P}j3D+uyV_e}s z5JyHGg`m%(2-Ee5fjkRzgsfOjzLTvpT6o#B%)Y`mffSs0tRPIeUSW7aX6MQY_S;?I zECU^a8M&b92_YKfzA^iT=~C~Qqbtm=Kz4@hDk;WyPr6*Ro6^HHrj}SyI5^9*O;wg) zD#9?WVAX<{N{J_T9ctnI%;DlK?{o)io6ff>n8#M7ggrfTtuAm<9 zA^8KmuYn8y9J3@1L=#!3Rl!ksO z&5HC+)VAxFI!q+RJV;7nGs$4mtecj#s++4$9EK`V4~~+QE^={8_iR5=S@Omjzf{>` z!`-(FuVY|z5Kig@X79wfDOlMNv7ZxhU5Qu3Ibcd(Tt}O8->CRNo7?T(wSaX$r@i_v{ax6h_O-?Iuoc4P5h|=;sA`~5 zv2dLUP9%qrAx`Cb1ts1Z9@b}l-xzCD1FqylUdW;rzS91!r89jHk2NKI&ZY`oA6*xF zR0~EEd+8K<6YGP+e7+*7Yd{d~Vtl4@S*uqQkh4x#xn5XzvGzW$xzK?gyHiUQM1cO4 zMV(8&aoqQ+@1s3f>~{e`j3_7+1Ba-A!6rwkcy=}XC8uf&khb~+aJ!prBro_Gs_Z1Z z>6tN53MplzeLS?$nx~a+5Y70xWPPY*mrczYn;f<}Orjc#gV#4^sLPAqWsQ}jWTT^$ zcXjJ*>I~V?Uy#WWa1j2f6MPn%UN@xzRi`gj^$by7!NvE#WfY zCY#ea6tTU10?eLA?piTY{GJc1UxAJ->5%@e4*yAP+5cha0Qgk}|CQMO-vxjI{yhQo z%}?T=iS64@^S>10|2PB`@HZ)-^V*XRxMFDQ<0&wql2DtH;h=a(N~;tr_s?6JRFbSD z%A_)CRyeA__Q9(uV1?>g`w_6Opp-AsFi4aLSr-lsB?L~}omyw7OZAOgF1qM7D3knW z6C96Sw_TnrH^fHdRMV&X`zs5FJu8$)qVsn@Dsc(0zkn$yHxb_M(RwY(&R|8(#4)uW`_77 z>Jn9|$&DvOOb|U|AT55q^+lYE#oOqwI^r7k5f1OCTCVQbR@}Shw?};i8q)QZbghX* zf-3Mi$eDZ4T;&5leToLj{4_!3U~^_?7cGFKy2RbL^0vzV0QRksp;cWK8PfONpldcT zoJpwDH>bBv{@Mt5dit2>EnpV_CrK)QyZ@Plm=6_PXrk86hC+rir-v4E>5Ta~e5(|D zUF^fT=J~z2+srMd{kH#Zre}>pp+M_2rH)z!;KxJX_;TF}W%KX`&Am&U^>9|(Opm46 zUJu8E={bJ>_*>S`OuTwkoJwI9Z3U%l!NLF@Y6?e`Li&Vk3uzhx@n$|Uq!B$P(pW=M z{34uDKHAIt2)o4&>vZ>3L|Hihg!@CV#+RxXYSA05eh-=5gHTcnI0@i;*@^>VJ5w>a z?^a5j+2Ek|h3c13dx=3}@67q(gUvAEyjq66QKoJ5@fX4ui2-I1ZlXqlGD(3@dTJ;yOpT7!QxU 
zS}cfqhIx6iyfC(iD>Uk!n~J;jL7rG!N6n!zOwq-gV2r24eeFqtmzl2L|=pL zM<5)Z1cTu2afn0lLtB)AUf-@Qg~{ zV6cXM#YtNq(GR=PUeKBAWeFoCYOhqXEo04R%+{>lXU3RJ@_s%*Vdfd6Dg`m;lRV>= zMxh0S7z@D`y>%Q#pTc151Gtwz4-_q)6q~9HA3Ax^-< zv%LzhI_-pTS_I9bRlJ|*KTvj^?DYUkq##hV-T{pxm-!fDF)U$cHBF|dLLl}b@c6(u zJ%zcOEOU zUIfY+5F}?Li}LsM0-3Oor~Qla9>)zo+#}M>UZ&d%EXs^@>;<= zyHYz$ygcn}JJ#xmjd_CLMhi9xDIe6na+Wi<2E?I#HDK5qUhc-w(mbDgempw$O4GN1eH3Eg`3yJo(G3g2ZdB|iXm)%6d|1d0v$vV*2gXjY ziNF|Oh&7be2>P-6Vv!zeKY`LFnN_RjI>mxCs=k#dCP`55a&n~Y?GY` zYuvPwcWC)WQEV#fW7x(ILJwrbu)itN|Hw>bf3s%)qx8ZyY1fRNJ5ON-RNQZK=u_w(E8?w!+vKuFt`Ck}b z_+uHVWLOTzjP&Ev`_3l!t^&=iQUE1Yqo(`Qwgri=TqVM^1@%lm`fMJ>QCG=acC1gA zQT+LnLQX;@1{3}QJQMo|ghuqAZ+8xK#{I+Pzv#b;b zqtw)k(MvXwEi6S@-yS~Cfpfub=LNhKeiWW#(}eT2dE+0)~3!9ZNPL zF!6yS5)W4Bg}3K*i5IIcVd8t%R*3IyWV?9;_@C8gYYc*m^vcmc7li^?RZm)^I>FOM zvl4-1Yzd48xeGwlquc?#cZc8LFHY@0SLl-Is{lgjl45r*87cJ* z#Y`6*(!%y}piYvZrh>(m@WTkLVF^IK+z_lnu9$|A47W~>Q2XqIvct2;<$@HJd0nUw z2ve&fC0l(Yost36%3*@JI!;&8Tj=DvCHWYps3^kAWU1HoiouJpwXCcNxvBE5_(12r zEG{k43qaqDsyBkOs-=|VxV$uyn6o37iQyO=fhZ-1vvt1^SAzijzZP_#CB*cH9>xvg=hj#w#C;}m55ZPvo6 zmLitHE;cm1Tjsoj&iu!di17+Xes{q4Nj~E8!|bhM>;0+5M>#|@N<brJj@^gHQh zM4Y*FL=$d5-EKYx1v-mCl*^(~^D}!C4rKJg$}L8%LoAvGhc{AtPwKQLv`RH-iX(|4 zUZ`{zVyvKg>NQJx$Ck=hrU!i5r8dNk@g)2_v_<{>jKEC-?*rzPMPS0vjb8A%m821W z_enhA`+HM=f!IrL;z_v+y4Hx!8>#MA_%r4hDe7gXs~YOA)2k6v_m{#}ikf$HxWa3* zW`k?J2Dj&{pJxfVngEj+#`=>(Owh1~rFJzh(FxQlB>w&vCFm#psxeshT>|Cvj|vdqTX zKIjrF3f&96jet#}@<{&smuj95mz9rte)Y1bW49F?=5DEWvvAecbQbteB20+Ge%(ew zM09Ci$`HC9a7|A5NjeH-`bPXbPL7{R0*!DZ5b&=Y%oaMrm%>yN@Ctmg!UlB4NP9qF zhD}U40~hYtEnP zrho0w{+Vw2zfU|^e^-HjsX1@2$^U8M@kh4lf4R>1>xBQT=KQ8W|5|ezRfQZ>0C4t0 zgcxBoEOqiw zTsRBu($j)qI!uzmjH}p1hlGhQ6)7dTOvp|UB}%!|uhHLq2AeayaV|rYdN(cJ)})=^ z(&LJQ=tuB`qwHB@>t*$xYD{fZ=UdjefJD8*=HK0^KX!rG{(7bUm~Z^kmHNMCM^TYAbQRpIDglr6T zW$WcN-ier*C>#yirpU6sR~f95zCqqN5%fD4B>^cfP%IM34+x@rw&Pg>pTcsB_7aQ1 z8xn7Jp5?XEjMMVBxl=`XHfkS@^!aibWCG3I%V}B`lD5*XHcNLtPng)9t8B2kxJtM0 zf-I++_;*8bZqz4U5`_^NCMRJxfy_>RFQJi5g%`R_UoQ{Z6A9CY03T2d3AU=Nwazh^<8Zez4FqLqnZn zMvKOEPFhU{7d(Oq&_M&LKG3YNq^zk&UBj4QgsPwb9^|=J#C|T0evx&=DQ%Kg)Srn zQxx{-^RU!lR4*|?LxabS1gRo`o4nni!|PrgnGmMHDpL(%eMYsfcPv3kE_3?URG7W~ z+r_mHZ6@Zcd3UPl=IFSvg%~^a3$?6~p(+}p%mqERuN*%iyekPp5B84XncChm-N?Q~ zL(@Rm7?apsgYYntQgx+$gn4v07X@8xfSYLZ;4E^~4VnUSD`Vvjh;Xuc-5y(RaA~l= z9yEo88sgB*x36E;%_n69Pk4D~Bd!uYJo!m!@^yV_2DS}mTb;PAC)qO+@?)f3Sg+~S zvDEPM~#^v;U|ex|)$3rY)`f$NN$2VxIN9ZfvLk#GShbgTpz zCBOu?ngVE$--7{}%EN#NiLFnr#{;hksE}Sa!V%%XbAqU!mgpcGzyB-qWRJv{@Qp<+ zu+KuWQzTCNQUT3)rK|S54jYwHAM_d;E)<4=UFbQ8x>T)wyM*XD|ES-{jW*ufdcuLS zlRj#@`MTSO9q1tBF7ObLIWC8UAznCbFVeg-@MlOv;&EWu7KZC2X(Ru9=A0i6YA|J#fku$N* zuwjg-TiuR-6%+0F*wI<+Nv2=Nz8{0WEHMee9ssW}l&ykhz+Qy)X7&e;$()0Ra$1Qd z{A4uMQoR&#OnD+uy>YOl@zvSoj)Xg7_|9Np{D$e{MJ;)8M8)ihG4|EN(waZey0aDS zL7F*VY-K=-SBTZUG;H|cB{O@goKB`o*Jm_eIuYE}&hBV5x?+6k#rwWS4s!52y6!Vi zgify-R~}GdhlXoUp&9dA?(3wv+6$nE!+SZcf&(xJd0=S`)&7rtJSMFjmnQ0~ZPz9X zck2q(E!Rgm?`HDfDRWx~r(OZK^oKlNPsDD0`}Db^OnGU&q^t;6X|W+B&$LJZ{pqsq zPSt254!E>b^~)77K6jaO+xNPGuWV*k2PQ}zFQ8_fKVtp{)PG^xFQUZ!_GJ1GasFrY z@Ly5=&jif>ohbdv&HU%JPqv?hHDM3mTm8>{~fs-J#sZEV%7 z8wN%b7Tjp;+&ay9LDv-w5T(=V)20UDNfGX8n&OU#Ms6i~5B8nS1Bl?gAPVO>kY}{x zMXyA8TZx6a#e)8EwwVTtir(znyw119c{OiQ&p|3bNTE2{-ph4jzI-~9C>}fX#4d?? 
z`0Vp{lFn68=6-jj)=)@EwFxD_8tjT=_?5(F6pmW-hNb-6XcY5Nj$U^)$dvr!VK~nsb8`}8A(df`@;5N%&ya{Rm)noTG7c{;OCP+xuT^T46!qA z3EEc;zi*;Tq_2?@rM&$Z(F2pi26(PkpbrE$vQQOb>D zr|1X<;J&voGLF+QX|9^f{XF)4@bW`|iKlthPVtWB1&xtuc4ZtGLlVjbs4im=$FR*d zglBd~#CLehte$K5`=>fZ-Im%_YR<939NGDY*eE9UU?`fg`??=akdy7di+*llHCJxUGBIEE})9N4ukoluZ9qm2rw^ zw^fOQ3HiDYz^q!*XFbn4HHXh3(V>HPpy|Nmp}EuiX3(lyY7Yl1rj z2=4B|-Q6X)TW}2)2*KTgySuvu5AH6(-5uUZy3^e=({u00z3aVq*PN_GJ!)gLHUcLAvBgNWNcOf$M(1h4$fA$*Emqbh|&Rm|*`k9A?aI-rb z1fh)4LFuE?6T4u0gMCp|;$ki;{*GyJRGX_u^D3Oi4|{k;ZI_NGk1$@_C)&Q)WWi=Ky<#HBt%( z%$X)l*X;=p5Yh?r^px>uBXw^-BypXiW zGoLvX7TjH*JV;Fov*Ot(xWX^DN=gVhWC=eEN1+!n^xCWEL=`cVIi2%*o^11*%QurxU8s{<8q=58?TXoeex={x|jU^O%mmP#-^Iwg0~Q_?3|O54@ZVKdF#^s*fKe zp#Qr1_>FAzzcmV2f5~$Eqxz^(`DzX1<^-w_7A@*qOo(i;IH=>=`?5RH0TLvBSTV5* z6#*phe&1P@%79ry*m$uu@l_BPh|CWreHO%IFL`o-MuCabv|ucz3)aSJ+-lg7m!41fy9YkD~=BUrS!@3A>jOl);_BlNf&xZW3w8%c01*=MII8aqz-P>{>qydy*KI=&zBxN zLJ0jchld^{_~7OODlR4@uG=t6DJb(k;IA0sROUBf<+8B~qR~nemZQZ3s(gl@HcfYsBQ)cYJ1bl{s%(&seYDezJtJm25qm%_}_(}Mdw!C6G7gDV!^Ov=22DDAw z4&YBe#oTX1!+#WWe?jW|?}e~_k`4dC_wg$e@NX!Y_17rZzvxc<@q~W}CI8BF{1;+Q zb;+Dq7`3!kxjgD~hkAst9HJm;^zP(ycwo9v2RLYe4+##Q+9WY`v^EOF^)%b5a5JI@ zfbNQAXiDeXxP4)6p}U(g)7;W9vv?Zkg!0?3x2^Vj7Z96hSuUZO83_UR;lM~%We~$r z=!BF%*}Q=Zw9WQ+7I?Ucyh}cMD>Q7ydF<6^raNyQkO;WO^UaI+D$$VCC+w@JCtBIJ z7MDzJmBMX)MK#Pt&1f38x|l|Z{DTa;5NyyrB@;5rEV*behUiV(zvaD!OlfxoAYgby zk8m5rW-W?a8|9GrQhW)`Pd0bSlP)un+1)9@FlEPScZN98r7F+yr>{khe2!#cpbl2# z!{%B^<>h>w8(qrNvtSbuhBNYp2g|tXROtiE47o!LAADx@n077lD@)b9w|j%A)R=uT z4|^s9){LVFla-kJY8qK9AqE7OrAw&1d3{lXd5nHwZMhA*3wcAb@%-xqS(_PlHa$<` z2ql8ofXaAR@7O!3&!@5%3D!}(!OPxuFtV2DBVy`LTmbR4osu>&^^3ks?=U%&&);sn zMK**PON0H+&Osi1!mf8_HcFkSn#=01>7D0N*`ORAFvnE7ozYhM^2fx;Dru^j>`ToK z*Hb0Q80TnaEbI>{j0K`C68C)_WW*vEiY5gnQ7cKhOen^DRjtCDgRSh8Em5O~bjRpw zP5kszV)`l({NR8S03#pcHDwmN1Sh%MVZS=2YQMXZAUD+F3ni7%Q}+slqHU6w(PK85 zeyI4Mlf+;~V?NaPUl2>nUqm7z*lO)%s=i_)x!vNAop0C5u;Qv#57L$=dt53lJM?9K z8ayCGsFG^yp^rlD3;a;>A|IY51MyQAn66%>A!(2?>pmj+Vw3a>vte>t9|w@@@a?#? z$dHv0CqSop`3lX300{VYXARNSK!~j?SV)fosAvx^sB5?f-%&Km8INoHML&1F;u&kh zOb&WI`sfGtj)k3k=)NYQGYv*kH9khT2IPxqszH4mOkgf6W1cQTq*v;8PZ08On1M^9 zic&9br?@$W*BT!E&gKjFt3HzyMzV6V$$?sVlOdKf$}g8T1SHc#hx)uoto^vJN`}r= zw>NtdU_iL)(C1tI+daL+Yx_O|8B$;B8H!QA`xZ=_MTSLS?&7D0%#FICkf9UXxN$Bd z0Co_I3l2qV0;vEqZ?4dY$c%4}4tVkOH`<%)YX z$4P2lh1Fnc-;-Ig2tMylqE}K;dMXX^#j_DY=V^^#jbUFD^YAB2;*@f3K@n1r@q%zw z;aUU(-|}9+_vzG(#)lnWJKL(@d7Jb?ug+)V=Ir1_vs;~)`n9`+Ql0jIUI2A!HzrPaP9lTSBPAvkKarU=jokp^!qD#c8j^T3p6A zjDJI%)_E-`)`F365elAEuu;x@!Z#$oG&CoV%fAA%&?KETUs8=~aH0Vu`2CzPoQQEi z0`Y!xkU{4eOhDoT?VkzipRQ*H2B1gbH%a|xfRy!r4v_vWLn^~xQKT~bdmO0@|1e1^ z!=JIFDgps11AH#`op@s`D-PEuyzfW(LJ^OOaTB~m8`aL3i=%l18Q$$*w;sqnu>`Xa zq%sv{3S{_v-mGc?fk*Nf5hd)^-|l6xBnZHiZ-`5P0`d`2+*Sm13$wr6dUVK8BW@b) zTK@7(#qK8wSLyR*;%R^HP3PG{#Nbtv4$r_nEUnC1$)WTH?$Ygzl)WS|5`}31hif*2 z(3o7E*b2eX0#Q^(v{5iCAu#Rkz#-kt#rfr%xB+F##(_6b;w++ZJ;Sh7E;T%F-C?{&xetE7A=qp4>*>k!gHN$K^{32^y zMa(Ra4W6M-XcK@^U|3_`l#M0dEsVjg>eX2_?BOs|?&9ir+tN-?u=HgGtnA82QpQSo zr{~iVPlr3Z>)FQ6(hv@H(!>4E*CEa!BT-(yYiu2dj{~3#{?raXe-F?6py~wLWPiXjY=7nT`<46hUrovR3o_9EGM2;k zw_d**wXf#5qKK(uHF%I}A{OE$PtPE4ag+I)36LvvaY;a<4UIYn!eYEb#R=-P z&R%ldGFa-1Ew`5;rFo1pD06cyar0JEyH*ERbF(aIFI{>nPfhiHP!u_7(q==QIJsX! 
z;vfxS>}h$IR4|%#51(?f9VnJnqNjKBnj1Q|q_fA@;nm4Am|grEaFSlb?dkPAsB0{a z2ae*LTP}{~$peJG2gqC4)|3LQOBK^i!7lPa_*&d(9b3PFddGQ+BnOSiU3i4Vq}hAw zsZhIDF(2v3ky71z@`-?uhGYsk76@t5WsJs!X)lK;v?V9N6X_K_lwpsSp-s&QLvk{E zycUw?sUHdXLR09%j_{v!`wm6*%xr3c&<6mRkWz>KIe|YQO((ggA>)4Y6n|U~sd_aO zWo)UQi#dcF8^VnRJemlT^qV1)diqc(+i&nF8JlR^)A}elA{<)xc}xlVP6AFmUKb}A z(x7O@=e4S?H;Qo*2>7DLH48(nWcP+yH$J+USrJuU~7))vK5+ggN`+;eX}`dwt8@(%Mwnlx(p5$IqEw-NpAN|opU!nqnt>O z7i;r70z1881)<^WlF|)61Om91*h3`@2dWDGx0tmz*epCfwtqT zAd1;A3*?W1jiMD7XWa>V4=G)*Qu9*gC3dJ#rNDncx9WCo!~?n9H^C}nnDcg|)vvVw z07mNWHEzPWP#cu7?xD-*4rkt(S945kbttd!QDA{|u$=ib$Td$~lcd*lYH#A(1s(QV z6m17hM?AIGI5Kry;36Z?`f87U%nYAZQZ7M^j?YwzK?}Zny6C zi>|$ZNc$5%5f1Th-w5Xtd%Ob{rc5w%)qx?8deY_U`NzJe%=i zb;dxQkY*`=e#y}A1?;2-rovJDQuWkw6T&obo=`MptSqUl*;1+a4b-ON{+HQz5R|rq z>E8SEE*^+eXr!&_?H9=718sah6Vx5Lwf+2Gqrq0~5V;wRgDmNX9H|;k%^Rk@J;&iB zWU__o4(Ndnt7;_cbzoxXDMygv%sh|zsK zyVc;->h=hinak|-{@Uew^ZK$-F-9^#Ro5{i`y}dmfsG5IrhYL^!AoLAE(Q!WU#)V$WD z*In-A65)r+zUEe*URfCV7AW+Nw`Q6l6635{d!W;`m@LGSSr#JNPXaOm+<2t~Eetns zHEh?56;jn=uS+|uv?9JlCQ`RqymDRN#A90SCo4~o%P{d0Yc>c-qSayCxb_;7_6PYh z)$|*JW8(Ndb?1j-_^yWjVV2JC@hy}!Er^yCc|p>tf^uF~fwen*(S?+x3BYf_i@8 z{ILrrgD6=Z4%9>412E6{feY)eLU2FzV^poYbc&sQZn!5h=(E6T9Vk*we7Z)sg{mNm9eqx78PK{j} z9D5lkqUe276KyM%{#%tM4?fX31`icTYnxb0R|A2V+^^40@HEtqnsV6DHVDqwUNGxL z_g`wBg}>lnM1%nHt3E|BrMC>s@i&ZqT!_b6$3YVw9U~9*J)-su3=&#OG5< z{rTFj7bfpSEysslw5HiUf0Tu|%WD|(;GKE@2EEkunHzui&7-*UR{RofFn;$1*Xd<- zTpDR639}Zios&IvZd+x=HsHQl$ez+~`;A>vsO(ydd9BiJu@z%5by>a+XTQ2FoC7e$ z|AVd~Bl=}g_P$SU$*9Ww8IDxN*cqzs1)+8xOBCjd7Fo^e7rR9B=@!fxVxM^O>TLZf z_rteNRNjf25LL;VPAcaWCavXL>3nL;zzN(16RHIPcPhdLU-|Y;`#`99az`Qjg4;XV)g3&ZDnZZu$|fb9t-8_<-0?*QQHKChlFcBen**dU zws&t_HK@XOt(Uf(0$IFa+)I_hmG`T15iC`~Y+GfH$I#}9KnR?}1|2f8-h=CDQW0M^iPOoQKpdS+Ie546y3GN-073h_jv&9JtHb10!LXwVC+&g2Fd#bOlds z84@~v2XDw%r(-cjhWeoTOHKon%GS%+i}brN13_2kzFd3(&xlS$8kz#}OuWQUShxj& zfl5o!p>gRY9jV1&Ser2JyuFVp0SVjhmYDVZz7B6pr`GJ9VmeNL)4xD$s20wwe?xq? 
zfgQBoJ5EJ(m(7eR`5-gx>9rBs%>QhwT@tIWVY6REJEJy;Pq2KcU4{^bmzl&7pKRfJ zT~ir-srC_vlJ5>|5P{0ZCct9U$RqapO>wvnzJ3%&FF`6dWYMeXUV_qbi9-x@u&|+y zfXTYcPW;$6PheVXiZXwqVt!*>1s)UsyOZFDK>9_+{6QxEOnUoWqW`PsVf-!YD#KsV zt}^_4ysHfVF!3q_`!9auKe}@dHMGNt+0ZTnp+5L`gHe4zEXPUsn24mfag4w!01ATK zER0Qo0273)>x_0udcY_S=;)G&hI$fwkH{Nrp*q)EYmw`YK0}Qxl{cYmvf$`h7JlgN zc?YqHlC|c@^Oih!?fiOV7jA&Gr=CrQ39W4mnDO1&Yfl+k8}MbT$&M(YmQw4$)E?#O zIiyvR7lepz=gWw*m)jL#mh3a`ebh2tTI=*g+8CM11)d8_xN&ry02Mx6NafH6s@`mj zMR0>f&cjJ1OAQ~(9vnkbuY1yoe)jO54Y-i+PTlD5PF>)3Q~F#~HhnwT46YEjt(erY zGXVJOE=??#YwEB#^v+rFp$#wVuefooFmfB~Y$LoLccOw`w`K?PCGukeSeQS&x-xp; z^~TFo0H#{O1jpsQz}Q-d!Bc3F97%XO)yU;tQV(pVA70&5pb2CKdD4zWx*n^VtcgS0 zF3;Ocn;uBlvb&c^JVUmV+f zX=>}*Pw;?L^}?Fx*VEOYJZjQGr_#YO`7p&?jJk$5t+m!!Y579gE%8$StVe7MAx9giO3E>nF{pmruVe{&kK%{~lZ!FblF`qYdvooBFL{7YhS z%7`fnNW#Q$t^5xEBF~476b29q$i{Mym(_RrV znZ!woWu+fQ5wbDvSN|xcgHdD^(sPb|yM&E1XaU6V=0==cU3H{%2qx1A1nC&w=uZCH zX2y%U&m`0Y1A$g*}gJT1L%pJ%*`HG>i(gHn0^lGSG0oLm(* zgsxh1fRt9ild450T^m9iD`UJEEKbCLf@*&8l6>T`Y=G02+*97fam!}2ewf%YS=OTj zbkuJL1Wa8kb>+)Ol2$kgs+~K@P1wyCyowwEb3`?HAzEY&L?Z|Q<5NjZ#1&$?9O(uT zg%bm+RWb#-U0&D0dQE@eL?-7$bHC7mrTBHEy^$0YZWfY|Ow^HJdi5+Yk^_Zt6vEig zJe|Fb5xz`LpMmp}#GHnSGeX(mhf%v+$3iA!-H(7ORFE9$aO%(=h56zs@XJoi^-GT- z+mFK7*U2V2@rWl<7P$UWE0oAore?fR6Mg1533UDO3rZ;#2D<$t>x5rb7f``d`ni>i z5z+U-KMH&rQKG#KlFp;YdM7zAuW{KoZcd%Jq=Du8Ru#ySm}|RB7eOyy&z{sieJ?db z2})(^+AfAnRAj{$5RG}e1og~d>tY)hIr4$i9ctQPkwth3K3zk!JU4LPd(x2H9YT|C z|H$^vm(vtQJ2H(mvc#^YXjO%O(k-~M)cJi^r8UiX0pwg^@xJ9tUO%K}>GNk!mp0!D zyTyX#ae!5z=C*KzQxH3GY?Owy8twq1g0J1+vlyO2IVukMy%HG$$JcaIMLq2`jv$%# z{4=#++BGj30y>3sOV-@!vj$Fw zZeA#QT7AMfJl0Dux|FB?9|`Nm-m~H#2*%9QITB zz_6KGtE#=QJhp@P>?*x!V3;eeZ0K9O)Iq{D2l+(_QwO9WTaWQ}RfawOMGpd;ARk&& z`tbDz!311YdS@P-#@TH|2d7@iUPe@H0nkmG>9A|L*6ij&ZhnPB5Cw%$`{+OO=&+x7skZ=sH zZw_A+eir3qBG&h2%%y6nPZw7i4&_>HlQiK;xVf1@RM!(3!_DFoan<2#aTi(|w{OeU zJAR6l<2A1ZF))H7v>Fnl%Lb-iEr|s7sA$gM3x%abiwe4aNVa(~+?$BsJ-_>^wled> zVzKnGk1m@b#-qqtYaqNnSe68(M%v6A$Sv_r2aZOuP+(a;pPNjLMjRLC$oB{l?OYkLb< zh*$v|NPN&#HX~LbB4ZreaAt4Rq9e*CUodvgHB+Zk#^tBV@knlQ^@10FmYxpmNrG(1 z%I&H(2>kh*?sTm%G;?*OdY*{bc%^(5+Rlspg|c(M(ASm*&sB`tA9WAovDn%d0vBej zapEegy>NKEz@7E15Ew6JYw$j1(@AaBc-9}_)iQsXEor>&_#~;&K1dpidg4p%=A6?q z)s8dPcAQ#IFxTYr(eQX+lSfb;f(@x9@+&L!{^a{+?eXzT-C^LKW)17pONIGDjo?=dC#597)!;-7D$9*r4=VxNGjZoBaHCwh{SFy2|4b=B7Ud)nXBtD}A z-Lk*^_M(4LN1CVk>4m^a^u?bcyq^IdMne6Y!2HbdR7h&g_{+o+u2P(4H$P{Xos1_vP#A4(y z>839#P$-~32`&>KM|wlA$0)vR7Gb|_XHaJ1T4G^3{r|eDCSh?$Xh4GL2PQ#ML-si}JK!GLSTWR(Q{-BW5iT8cMRSwYt)K!0HYv z@pASOXD|()clBV5Zb~a@rAri_<<8*hgz_GeCndvmP13Bo3|l3BQGf8=W=?s&RLn5D zwopnAF=r86Ch4MQ@}5XTevB%<-0PN;WQ;JFJfQ;EEV;z#&Vf#|*V+mN+cKXuu%QS9 zGF_-4>dI8685QO$BWfP&BsIInk3lEvpDv&LM0quw&dpeAP~;gDH0c8%@>{6ks=etK z-CuMPXd?ks7odRf4ky470=|OXaHh@e?we{CB_;K%+LnQ?GFrIJwcZNhL)KvYS?=1z5^2^g_)paw zbDygY8%e4>D!HI^Pbu#at^1e17ImIz6$z%t0>|nRx^-1Xog!$u4MB01Vh&Y;3VlE~6=9wRXW?`cw_ZO);g?WwhaE zRN;72Hqxp9)BR}47vV2eL;H$N=0Z78e5|R=^~1$z(O&1<$xzz5=QaD1iP<=(XtwqcV3YBiQ4jVV%J1p4RS{OUw{rJhUCAG+%!cOwT%^wdF5;Ay-pA(4u`2Jp+e`pr25- zuzK!D6QCg&;G9^$WQQ_93@Nb_fpn2;BNwO;26cl(3Cw$)K%-0%)8BJ9!N|BmIF-|Q z`|DyXfu&SI+JbdT>$u}O%z|JneX*{Nj`_+O-@}@#;(0LazeCZ9nh8e#CS=9vW{x{i z&S!=vUy~|)eEPNUVOy?rZgorueY1_95P>&qTyp?&@sKABv@I0_(TEsC7KU{^H5g2Ei^TxUE z+pnCIYT;7L)c*6{95{^HFs8t6IIpfW1WRv;JA{8mG$?+|)Kr2j17A) zWeZltlG(!kI;UHbFs|FlMFNtBTjGtVkW8kK`KJgLxYf$-LY8B4Vk#-WNJ#4&6g8#j zbm8}L<~@cf^^~N^)uZDu{Uu8`#oZEouCF{~kw{1JbcyLFRXT z`qQ&Ax27VP$OD&ZVxOnJY>^_oz|j(W%R4Gz*@TJVw@?6Ly9&>gl+(NFF36}xp^BMN zvQNb++7YNWUygynXAb2p z-w5uY`k;j-fO&Xl8g8J}|7Azya4*m?=4R)0qe^44UBWPmlM09ctE@U1)S9cs<5yQW 
zIxvhc5}Kwa9pdhM2--A}>j~8Nujx=t)x1ky4!_zuX1H|3&K3JE9<|_X%+)8N`*uFF zfFU%yWr=hdAK~a$M@>;7Jic1<$1WJT5P5;w*Jln;HBU&&$A|eeK3nS`o~^RJ&!v+| z56|}jH8c{H`54Pgjl*!D2qSdeC{y!8kC5rfId4!P@1LE!`a7#MJJ8|sGtB-6TIt`N zO8;e;02shvW@ZOE(0>nQFfg#uvNAF=v;B{SGEB@IOda)U4Q#CFOl)jSERE<)%}h+4 z^;}K$>`i|L3;w|dGcdE$vN5u<09(w=NXyPl58RH!$VN}g&cqHp+KQ2d`PVXSSULe4 zJ3}KoU|8q-=1O{4U$1a#2-F9l%i^sEhytpCZDa($YH8QiL`{hPsU~5aKZ)u}X$H2kCV!*1$WWZ?5LC;Rl zz)Ek(VaU$Lz{*0;Y-m7F&uqkE#9(AV|NVoObl-aloKwHtX#eS%#L7g=3gm#}fCYZP zzs?#adf?Z?0?haaCRP1!OeA&&dRlr$M#kSJ((mrz9~A9B@%O)%4Fe2r{@C#LhisUi zjPJj0VqpBqJo+z&$bLNGXJh{>W8wSOzwd4NqdT}pMN$@(A91&;Iyb6MAkQFeT}+}yZm*Wkb)9{TNCzFf98{H6f%z}? zofJej!vjA0*sW=@{mJwLnAbgKxt&2rxae!!F+Z|j+>yw9&5CU0B8I`AQ-O!1KuT;G zUaJGM1|uRj@H^t}oZoqn?UH>n{oL&DoC>{7-u#9%5N-b5eu1+gDIW?-Eb?JNuv(<9 zmfaSO5%Op~moQ2V0@0H$SA7v@8x;|Km!GC!C;^j5%e0IOPICr}FJi{hOy0-dxBi55 z!+xG{cpD8S1j)8!4Y5j+%-R&z;N~Hwk(sF+Ig-qNV@05i zp+DL(LGYK^$H0}mL4+06d1p=Y@||vs;CJ>sRmN%wgR_T3JG(wE&%;mb-V{h{LOJQXRr)f%!!!72u2n!_8bsRWi4|$UeqXPm? zWGcyCeCcfz9#iOpOXESIpyHQmBP++{UEG~YpeaXFm_h12)d&DtESRvO|^ zcm0E8*SxOyx4}^ERlvAmVbT}@SMax3f%C_J&r2<{cM{nl* zd$q0kg=e}0c7}!2z)?b{mU+f2^j}B=X|ok;7le_St%$}k%Rr=?#C&U#`V_t-zsq$G zB1)n}x806edW!YE-g}D)8a}hNH*OF2YNIA9=>)zG+0SgCmuNDiosX3|HqbSYRkm()1 zO_g!qJVE*7mi7Iuo8y;t`+bFT06AHH#TUOwjqip3aH<4Q1F|pz#Wyewf&IHgWd#P3 z7#WypSy?!~0}Og5TKXSl|C+kC(lfJku;Db>Pn5s%C}` zra%A%6xeiPMnHV#0L#D(WH6-@w6V0YQ?%7H09Fz*axyb8QVUNqJ)UJ2mk~W0006013WJP z1ObrX5ReeykdP3NP*9N2u!yj*Ffg#FFA?AoF;FovF;LObvGGXov2loS(a{O035ZC^ zC@3kg@M-92$mvPQDagP75fCUSC|GD%6j)dka%^;L@_+jE+zvp3hVX*v1_OBw07U`; zLjrm32H*qR2?6rs1^D#^0tyCfFBCKkEF7>x4I%&(1Plxm91H>i92{8N6Zm@oI1&Uh zAp<`oio71w>vyP(K9L#FL;@8ZXbNK|#7z2jzA&)p7?@btB&1~I6qL*?tZeKYoPt8a zBBEmA5{gR7DynMg8U}_&#wMm_=JpPbPR=f_Zhjy913m_R3W|!3iH(a-NKDGi%FfBn z%P%Obtg5c5t*dWn?Ck3P($m}5KQKNqIW;{qJ2$_!zVU5yYkOyR@AT~a;_~YH=JxJ; zzd!(BzjO=u{!7RHs$WRJeu096gMmYR?-vNDGw=dK0*4@EfJEk(hthk8@|w{H8dV@N zqoMW`j1EDAY9WNK0^6?Atq?{Hry!4-&?9&V{}W>C&>(=fG5uk0{p#; zis%blz=&u}!E13Zhiv!DM<$KSewQo7VD={DAus+%X;<|v!nJZVb5VEA%L(4T(p)wt)ZU8XG0+uMDz#|zW*&1XPrrPqo`R%W;v&jE)#SsLBLW%Sb> z%bd&W%iP-3c@^rh+D+nZzh}VAlVkVv%!eG)OAm^kn>pDXuN-$R?%)*a&eTTnxrb-K zYI4G!gagTD!H$wLFUvEa+05hm@{#H?dUMc0`E50C4c$BrZ~&7i+va5*@l)vLMX$Xy z0I5MA58)msB(FYPFoV#b;isnNo#mWI-%kSWC|dQsT;4l0O+Etz;m(J@Ig#wI%f+2%458_y|={LZd0?CplqFSB7JWNa(~pQhv=%wT*zZud4A> z!g9G`*Nc?v63RvX40w}tY1iDeZ@-n+UUz=R{;s}F4@Y#}@b3E4rIqz*UQ<&f1}`xK zgHnhafb&t7CGl#o?DjV8>51)T5><6xAchbPGdqgMO)(~AVKVJ7{_XkYyJvtG&V$+e zImc%JuPXlaWm`|tnDRgqTw)y0AIidOmtMyI{Iw{xJH*_s4#Z6`OLpY!O@K<=WJ-b4 zV)grxJ?^{k_sgBdc!Cu0*@lRA&}FeyQ%{I4ISHYVPg1Ms)lS?W)t&+QS0gnZTQ;#@ zgxAT7G#&4r0g3y%45fRqqB|qXnSQ+*p}E4xX*nJZhsg5ENvmm%fc(U{)>Be%n(F)( z*-d$t-fPwK!i#j2Q4}e`BvI>PFIa2Yf&vvNZd*!aA5pC&Kp_k_lLEB@TixVTdWIW- z!n7W-7~r`@nxhOisxT&|CyXiw@}KMf-WVubYuc|mpcyTgqwFe)_xI-#$~)J;)o2Ey zCw}v0bnY88ZbJV*TbVz~6tR_b^3Kws#+2tb?u%|GrHSfP9`ZpKILnq_*RSBwyB=aX z%y(U)jjLb;uhHf8lH>2Kq-AW}t~cwv&feRsQP_#TPx6YKiyw(c!bt``=WS$~0;;?) 
z;OB)_u_iW@G^y*P!kc3-qqr(*H{*;!Z2PI)Q+(v&(PFxtrF7x*cfnQgi}a(UnSA6KTM2fdDxnuG!Jvnxxyv7`tYMHhiIrMy~w<^b*g2W#(44$5 zdNyZDTfLgD2v3+kuv3=qJzn%wyKr)f^%Y`m1ZRiq`<6pyv(W2IOIvX>2{J@nN<&W z;pI}wOy6Zu?l*204f0&`pF-xxT}}*73=KubFIRTlnH^DwY12c*9&y^?nXn_nCR&qBaK zGxo!~n zk#QwIMqb!eR8C$P74ZE^N9F@Wxq;}!uH7RXJTj6eyuWO9vu}q=ENTrazCqJkIF4fk)vwz0kK}c+t!}5?X zff|0lV(WTwj)R{|#n@Q(>B>w0wB}}~>abq^pbP3!rBtO&q)S>Rgt#&17N11rO>~Sq zeP$4qJ@Abj@aqLO`vkdt52wi6bv5*!N5ku~Xxt+BMp)nS0Si8aenz2rdxc%jiv(?6 zwcS`z@r#m0+h)ckCVTy8kf(1Dc_~Wv-$U#FDgXPH%>Q4;#`o!LxpUZ}s9y!gC8nzB z6`%~^w5}U&$xfWcVLEC=YM4i{R{UeptFFh$n>&RE`BTMxx1!yvDExy3(FLDzbr6EO zdOWCgPpXe)JS|Qu)J{A$^g|>HGO=t&PhKay3;=+xErn@jgDmm)5fSX$YI+NPb9K)j z#bvSO-onIA*-M9bskrD>=>@NfA*^J)dK5~boPuTFF_)%jc3petpRDSnP&|EB`pt1i zSgj1=#%R5E)KR#qDf)@TSqALrjlg5F`zw%#d({@zJc+^roHSLxFQO>2p9>esbE^t3 z_2gOFpODf@%pIf0nIP`FVe==roZGMRcLS?uV0m^(GtCw+Hg@Hef>G-DZ%4skh|k2c zi>PcKw;l?WLot9&&d>ikCM{9BeG|qLs~;qYS)O?{GM)CTfcPmWR9?2N-PqJ=2?(*_ z4fCkp-QQ1Nsy12&@#HpCxNx!$SjW07d1+!RlyEX``Bl9cJ{ws7dbKFjzvJe2!J^Yb zpB`ap1do78R3gax(V@S=!nR~DMD=3#En4vET%HGw(2SW^Qp?jfcU0i+`1`)c-sCau zwUg9TO~O2|3Fl{kMr<=z9^Zfv^)ukOfu-vuVz2mIec9({0Fg=3sH9{vcTMw*LSV^; z88LT^s&=M}w!TrC!u$~jVi?~Y(Eg#EtrI)E3XGH&HNr&8nr;oJS<~V{m&kD*A-VQk zdhhkFBXDS2*{)0Y#(Py)UKs6?Ea5N2{U0#q-z#@5>q$W?W!~4;03Teyr?m7MVcqe9 zd+2GkhWMKH~aNosz5+`QmDeEISG z_^o=Rfl|h$E4RvW+6e*zV^{s)d`apS(Bge7;L86Km8mj^@>r=V)l}4i556Pd^J?U( zTV?cu&2`z=nK;Sow8~k))07V>IqX~~pS4FdnOQu&zz=yVL{~YN^O#b2X zd2x|s+>2zFz0z11ahoT%SB606b31CaypH@C;m;`wyn`Y_6NLEbT%K%eMys6wlc*#Y zHBoFRM#%7Bt|^6GSlnbgxO^RHBnF+pBdOcn3p#)c#@|6NXOPR88@IH`-|kX>`38! z{v{IA`$V)o>J`YT+h+jOlPF62I!Ybg=95X%uSK^~Z)5%kG-Y$K$hGVuISV$~Blc%iW%?v3KPX@$&&d}55lkr96 zK-ByDO7;6rrElp_kPPQq(M}hr|g9Z1?Q7qCkW=DnDXJ& zh(NJB{Ffm6ACS6#Pa6NH{CBU&|7&BRz(o1D4k0uH%L!@GV9xD$h}AdPpam=I#k|$2x=SK(yEGexm_?3iZ+@ihP0692s{E2Xx_Qx+9OLuL8f)egld?JiXMR%TJ$e(jsoyd-_% zp8}-Jp#|q8AEX1g8@IA_r>~H?+Y~GU`dC=la!$%ksXE}F{<``<8=6d>P+Y2P*6~2` z^C3<(8O;GuN>JSHd%5LUEnAn|*N><{&g4AR z|H4E6vo?S8I{wpN*#FAd@e{n10UE?vW}^-qcS_Q>y+_{UpkEoXh4(rOnW`L#C(LG5 zscXJ__Vbfet#9rW0mQ5DNs+qtlLq^INb)LiOQOrflZ8QT4j_D}(`42`Uf8i#!063N ztXgJ754BiZS6T_ruO@e?$vSP2;|T1MAnZ}Y$LkZH$;zt-5z!587StRx5@TTr^K4v_ zDjqr9AdRCZN*DQRPw^Tq_K_>ciOq^}3h{n?!AbYiT`dw<6hFUBlOB~UN+yTRMj*<} z9{RbS1o*(Cy*bJ<-ea4VWZoz&kx9p^f-g#O`7VK6!ve~fZnW1#W>{F-n!=w*;>uD* zQN%duMsZPY5HzmMk2)vnT1VW;yI9=LZoLwKie=W_=MSI+5AN=ocb?3?tc;^`@7!x|I`Ia%0G`Q{;!mOkA@WU z!U87t;lR0rdaQWP~oGNpGo%o8Kdj zgHg)L3JHL!C__H2>sn~R%5EB0k!;~2j=^V+2p8Zizuw>9*V$A)O3Mr{DBe>D6)qSI z>QwCR8DGibQOhawn0bSdWnn$SQWtIaXj0e7!IKw)-;KS=g7CqL4s>^_}fewLAm-71x0#eQ!^6_}wwtGXSS(It`Wg1uz)|1A(wxL*$WaFUFr8`=u~^gxAoQXzAWB!ex>?ui>=5U>8q}Cp1hOGGU1aM z-wSDM({Eu5hZu0<7Rg*-d+hBG5Kd?lvT#<$ZZ4LYBS}rSUWqSai_uvlH8q4P_>ujE zko$TQsGarx zS|0tq|BLmSN4YCZVo0~kuSD~q&3=n=OG=&Mpmw*V1nFNd?P)@U)yMn!cx8*6TR*o` z6>~2CA<+e3%=V*&-;VjE$<(Z-gSi*1KK{>&Kim4B`!7E6|Idws*l-aL4x?z1-=`-5 zl6?90-YH`{bHC(?#Uf6XF-04+;(Ym$SK?DC=Hn;?zl@SGu$nIX!IC)he2Ud9P?$)R z6YMcvro(z8yMRvygQeEvYt#b3e1a%IU>EX%7If?{=5zW#G7kYVAY|!$2(ZvjzZAe^5YfWZ~%fvisB9v zKGvaji~o0XPYFZX8`b&SVrxAYD#~q_3ViSuXA0%(>2?tc7XflpIq91Z^{3s z{NKx>kkeV#RBNADUKF;0y>!(g$NyAH*83JwcT7VPkDO#Ulim1DFU)nX%GJYuucNlY zy-K4+sKlP;Cz!tFMY_r`Z%?YZ4#TN;Ffv)2;p-z@C2E_Z$a}rf;BK(aAu%ag z3WX7+EFnuY_MM@uAz_5F??k96WX+O&pRpwSPS%oagX~Lo+4sTpyGEyT*E!WW=e~c> z{XF;c^v8K!*O{4K*X4U%pY8p5f8O8AJ#W^=Zv{XddxN0@8fsn?x%3>6UZP*l4HnUC z=FCm%Uhc@$uM81ECYBr33?H%lpeLm3STUGJ$|cy$SOzersjl3;+o1<=k)hMx-Sj-ALr={CvH0~IL4E96mGH>!R<;HLWKQz!8-T& z8ibX2e)A-F(1vd2j+6qch@2drr*eM8Xf-B_7woA_^M4Xx2{kzjxCgpqgY5@Tt3RDKTaCo67q@iDjw&MD_{Q(?isCF3hm#P4W 
zK>uwM^s*Ru(qe_GKR%IY@Z=gD+Rk#&KGh}G>_Iq!E&0n1C;o|)*WNQQU5>u&al$vp3qhsozw-a1F3&;z7cmUzWyf|2O0JO@%84O5r%3Z+3cT-_~!gEGAaZWp5mF!u^sTC zFW~_-l@QNCDcw5P6C)nke&^5m_tw1ME3E!^d``#TAk=9wN?@4@%bt=h7QQ#+u|~V} zdOar~09xtKxb!rv>M*s3nx34#Mer3Vk~sU}84RO9r0ja%ywxfY@hWWYe}3$^<2?4T z-XjkhPE=g*${Bg*7yj?p$n>U>s7*2OYEEr8$9uP90hMcxJ!A5d1q=u}( z?8%pr3Tczwt5n5n9bc*kt|A2m4Cb;l=Rd=(KviT~jL;DvBaH!XH&XzU|oQ0}9g(cJHg zGvFzChVu5jmVc7`vmY&w6L z^Oe&VXt)_I;j$!6^+3Ia~=D&jT4~Y@RdZlv@)Wxpf znt7fe>6~3B}qi?YrY%j4P9D@yH?eTF=?H& z9e}So08o!+u~W?#H5#$zQfXJVt4$i1@hrp5)-@6_0Nsi-pQc@D^%9gjtJ#;IG!b%G zqVa}x<7)LTnRAS*hSPQRdY;PHMRS-qXn5y%l0%O&kd8MBealMlzhODw{R}n8e@Y77 z<$LN1P_Z9!;AX7=fsTmLE9fbM#psb0$caY`gOHZ5+ZaH_ej7j&sqx}8Is3UOEFmcW zY^j6{3++QvS=>iWzr_XlrJ7}pTAiQMhg7JI$ll_O%%W;v_O;x3W!UsnL%iV%Zt~Ba zuP>e3=X1!xlo9?>wa%NmO;TAPRboO)&Cms9Ddz-@EC4R+h_AQzCTXVUlaBU}bRFVU@ z-?KrE7^Yy+h)_S0tK^PSKPx8c04$);D)(HcPGU zl)YQLrp`EI^v)wFYGaYaQE#|bLz0(C`_=hd5sxiJyxXBpkpV`BSk4wi_qyj`&~q;h zPU~rU{L((%eH3%+sfF<^ZbA}DH>C`b@fG-lYJkM?_dx@&x*Kyr`V?oz2Vy^9)Pps| z+o53Q&+qO(dpud{%BC`$rIFUuKM)0!(yklSb=UpvPg2i*U2ftY5Pm_&Xw5p9+M)ox z(-j@|#%T>13fPANfUBkT#r(16R8O&3P_pd0HHvG4gB2qlTJ+dPeh@&}(U;;Z1upwr z&i=i-XMbCEQ^YUoq{RygiFyUw60xB$jR=2I_T0kp1K+sduWpx+pYwkIK$n(lR>iCN zz8HMVC(h(%4|0!u5EI%=YyWlnkY3RF^0EoE>!WX3q z(|sbYc}o-zEhF$!^O<)#rIDhs=Gkpns?lHt>#g&25f-J4hEs7(r0ZG$E;k8D%(q}v z+B)6YY+5roH>=iRTgJ)MZjs+_IzP(zGAU!7HA7@Fv-#K)o1WaXcD<{~I0T)K{&mlj6u!Af*wVjJ#J6j+jCd4*LU7B*49%f4AX;C~|DfKRIzyLH@cyDLTs zYqslWaEXQI%T-j*4`p`(^s=vW@h?+qu->$F))ovAU^hs)0U&zpebpEkrWN&^ZyMLF z0B8@m(nHDuDKjqyUeM;Ed7d8RaGs#zlmJU?YFdF_rn^_fBhU?iIsE-=e0jZ}=l}oG z`LEV8AfbUa905vUqP`?7L<`SVdvXW9%4Xno0g{{>(VD)V%JfF{7vdymwOLOVx&Sh5 zU*-42zR4G4Kj)Kw^LYH??Z>fG_tcT*;Q*!sxzyuwPaYb*&%Tlq8~Fxwj0;)Hy5~rT zeUuacQpZbAb^ffO%&^sl8!i~I~gg0-r9dOh20s3R^VJg1+u=#CneLb@WdEaWw z^?nL`84KE+`L^IAT$3P&RHphDqjC(Xsqu$E#IipgcxSr|#46H5e7Ou^!YJ$JtxikL zBPQLE2Q7k>M&+S>IItWjRG^G%8oyM(LUQ#-#A$)VF5KDA;z9GUtf%LS*TC@aGJC(5 zOZ@!&?>1Wg!tY54Z#>35&5*0g9^if!1>}v+VCWyed8|Z!Yp6r?nf!L-m?(4e)Eh}ruBdHffx=bfUejM5HYI3s$W&7 z*bj_j(BaKbTMgu)p}OhBDV%=+#{GOXt(iMD6>t`dOo|lk7zar=DPGwKfAV~F`}tDt z*-IH;+XjH80cOmpS?UhV&3K+(E8kZ*lF^PC@J!TZV2va`Km*yngO!b8R73?F(&Y7Yb1{n|M-V1ds zu!-c~Dc+l^kXq)iJ#?ohB&Vbxg!m(fKFeRIzVAQ+Q=>Ub>$vtp!vbYs8y)R-;S*0u z5<(YrrVb|Q(iFI-E@bB;=nW=ybjg!ovnmXh6KBf^hGUp>dbx(AiLbGqe&6`KwRy|1 z!n;QA-1B8-5nU z+WB9w+rJss{@FFV0E$(8=N*6$(slRMSj5A5O%{M|g`$4fV~7XPO@4r~;B(*#*GZ&? z_n{1i0peJ?r2`vUj->RkP%ZZ=d8Ai=a(5r~l$-WCy}El}Sle9v0r(NBMcez1Z7LJJKJ zHPhhICZ%=&Fes(lC5DahD4uuw@R{JB6qp=QTRu6A6kBko(c8LjTqA@5v!=mKf~f{0 z%9v81isaV4bfWc2w3kC#5@s%hJrt2E30-L{H+ttF>X+qY!6gG^6_Qu6H_MON= zW%S>h^gPd^_f|ChgL`5b>9$`jCclp%&pf?~00ckba$eu`p^TTJAyiyg_l% zjiuN3+WqcGLY=QUmoMiHGOC1+4LZB#Z9N^618~(VGe28Xzf0i0%S-&P|J~sha`&5a zGjlL=@x&<3ft(?%Z5{Mb*(Iy8U1XcUaeBzTW0!>C(kX0roU}kiP>}C&Vy3?^H%Nhx zKukQ3pjQBC1QnI=o_?|58tav?Z+#0+Jf91%?_-F_rw0qw7F)E_xt@9g?NKO!p~_;t z4iYpnnZy3td+t*#y>P4l`1)ni6g8c~sp{!3vh)SGmf&}i^zWHAkhPxzz%l}ntbRT< zhA=`LA)LX?qW`vc+p;{g^f%uI5%TS45Q=5P&;NucLY zi@;;UC+rABDvWPr+G~s7yhvVV7yN;@_dUwdr6 zGGYGrlDMCrf8AF3H>`&g&y@x#G8oCpktO)vg==-*Z5upzTI`2R&<2aKq<@%$G;VvzAJ67jmNO`=d?x8ETX)2TQL^3!uWl zrStNeyr0Gse(0z9V1p+}r!MQirx&VK`4ADg7-7tr#O#o3w97oaa-iI$j)AXxy~yBn zr?xi36YXGchN%odhf6cf`!#Cp>4gsA;pw|$-010@vb=pzDaOT zA?M((>vm=9p@@JhFI4$N^PA_QHT4sM3FsA$f_> zJyZY09|i>@GJ6iWPva+=+yY3!pu&#CbpecK>oBzex8szGtAPpS>!g0eoK-cw89n{w zX!+ak;6XV}7p>SECAyTb_eCL7^^$9Q6c>3@r^NSSK7y3ctF&y1<(-rGU!v6&`Xgbe zXnL)2E?A26J)q8z5yc$KVa4S^nr$CJWISmdu8hXi&4o_)V#Op$&RXMP4pF=;J)c{? 
z_-^e*l;yPm%VsQI(S+!x!kI?w%r0^rJj1ADm?A*+EaLv&+Z`NW0U$2Yq^Ckp zK5=Fki~D1L=~GC?!2{dMMCSV>s?>(E0a9bF0v&dvp+D&q<4T}$_fuO1uKa14_Wk;* z#l)}Gnyw7?_qRES`rFYw<-XphU~{MDl+U4n#^(fQD$RP<&DLA=rc_FroT4H3rwjuS zBHrt-a`f4%&%zYVcDZ-;jl3Bj+8Q*|_o_}hWoNU*PHKib9m^|w7A}&Sn26X>&q@@r z1>ETkTp$L|4Igwh-SpwH&{N$6c_b-Loe?7*Fmx_brMSBwutMvqdig@~^AZ`O8`E=L z(gK8dqeKH;6GKNyyZVAl( z`q{wgQjx?XYKTyK0L53Q{xbUi**RGC(@=_bwo*gv#ST}&N?4t*%dVxiX~}Kp-07sb zty`&Tk7;;fg0g7A+*Il_l8a5lJ^&nV8Fn{mk34om!)#rIoJp>q?$#x&YF8UJ*L`(wI}uVU19X8%55Eb$yd>Rz_|b<>#utITv&F}4UY_}1vJ$_x~cn?O4 zo1Rp+t+(#nAINx^X46UVNnQmCs8ik#L5Hj-qYi%3S9wU2h_%I2QHhTKP2?>30@*l9ytQ~#;j?cdp)!{$B3HW8jP9nwse1JiA15L((t)3bs6X|#}cCU8nAu^jtYbM8=9)xdg4nul6RJiYt*nV zJx?`4Qg;J`8w^hS#>5UXuA>nMw#)#rPm!i)snsuMSIV6;f9<9BPVYO{APTivH$T?( z)DX+2yN^@RhS|MIORKdOW5P~HO*hRp>SU~?NBwT@O7G_BOK9l(uE{A;%&1&@m+`K= ztY|1GP(ySoAS^>KhF0$_->lm$;$PO`_8wFNfDOEx|7Sskpa1T^7T}>opRXl{Jj)He zydp$fCS85L<~TJTR7NeR26b1K?#9wknBFx6BWyht|4P$HC#5?Aaok~PfPXdPkEtW1 zpwrkZpv&L^jAaAKf*Vc)Q6IfQ>|A*OL=cbMK)o_r0~`J+l7C6*vGcwBrT7AIi6Qyi zXCAh$joWxzJ${-Qt~R03F$`yyj%x0yd&sWz?YM?V`5rk}G5p@DZTn4wYggYq5c@g0 zFNuGVa8M|2ol3u;4;Qp4dc|~hTKE_p|0F3EHrF-6c#zffvbjkfR!fSBF}1@Z3bARG z&%X(E)pJsi0NZu3V%+`OCh{f|EZ^ioHFuPDM6+;WwF-`n@k&sM4WBi*zOXQ$Ed&Zj z1in?$?|9mxv+>kFO~Sy*aDjc(CR>yIGn%T9X_0#i&E$kXR`UP+P^5#H_b!g{XFx)6-M z5NhUV=D7HZ`KW_YaNRn~CQlJnv&~(W%?mtvuj|KyO7j?!J2|H2dcw?fKe$IfHAXK{ zmz=879L5sk7L4i>iaNOD@o=%6GktqQp>#;{fQz;Jb*Y@Z8+SZX3{+spFIAc@m}rD@ zT`A4#5#ki{77b84rurG%*%nXBiQJ(}fTGndo@siNE&T=fnV3L%(&@`DyCgfAHd{N8 z7mv=$B&TwO6x3iT1G6MMF4`GMnfh~Q#o|fK#8{RtYJ?NgTe}yqfoyRjC-+)! zyTM`mQz*RDzD$Y0Ac|`H*R4jd-?S~!KMbd_HbW%0%%tEqkJ5!FivaO{Xup3~Ba@Hv&uqwY} zvCm<_+WFWEMaG2N+%l(j3jvLmt_22doQf@Mijk0~K^<);#mEt4#W>o`*yRCPm)&}{ zfXqOc*~++AWvpGd6)-)6nN@+*ZYfR3c+Y4~=bV|cdyL_b0Bt0{s~Thsqd7>;h8oiH zuvNur@Y;K_t7u++utzZ^vthf5lj0Rg&5X8N=ZU+;IJC!xPxe9dVU?gh@XK&GK5TgMUG!P~`GQkSj8M50riArA~fkY2NgfAtK|E zjfXM+YX8dZ8o*W8i|l?0(1>3kS|OIZTL;12 z10O+@jU6f1R9sBvl>S810;$7yv(cbl%cuZo)jSjs;32W)C=~-#G75;0OWI2r)BB}3 zWso<}6$yaqrCX|T6x&hbfy?Gy8GN(40Up9;yl~@z2){pVd;S|hv_C79{;z)z)-%1k z%Bnb)sUmiK-SBwsN|j;{@udY*G` znWnxU7}Oe_yXtjPdNup0&GV@z@gw$^L)J1OW9;o)k<&L`ypiPJQt2$~$E*p*BDNh= zmqp4lWGWZ0RatEq#Uv^1OxzKbP1s9t1BCQ@;x_j_f*Oj4j2VK2-z}v`R?H%>aZ8Qe zykyM=d;zJI2A#Wd*=8Nv;ZY6oemhx0(s7&3M?_`kZIuw4?ZpcAU0g;pll}ey#Ko+{ zYpApcwUGR=~ngNcsctWDrDKTsY^t z2T9_#OpcPZrxem*96g3XTQKu$@E}qE73^efdtqs6JTf-Iwc{u`eDV@etx%94hz&on zdTM`?7B{L7h}|7Ce4KO&3aKTee1_O=ZHQ$Isg3$j=xFt#sP*<#%&VC=6#zzCpPRSW;MQo#I(IRAa8*sP5{qLI4&Y#o0T+G+s_;a_=|j_vjU^jv&d|uE82O9 z^U#LeuLab)bUn(`iZd}g4_+y-LkBNT_T9C2@u5~xKZz?HQCb%M4QU=kVNo>2kuTXk z09z`|!a2+V5QMp?@Zm(PL;>@a;Ksg+*3n^as@lTIg*BF_%w}Kapd%kacNiyKdJc%v zAEFnkdibF#+fR{(?CjOd!dcr`#*Wc;TNhoC;RW#s&cO0PId|4G`GBx6xvEV_Xpu>0 z0|bJTqYMJ>W|A~mX8OfrWWAFxgu3T>KuVLxDB%P$X&=^k;Y(A-U ze9PC0%Dhuw`kI)CoMY>5-ecQcCA9@NAM3GYh6X{)R+H1sB30VMTg+A4VBUH7j&7BC zg`^SlnmXNCqjo9y+eE}tw6%lDblZKidU4WM01_sR^J)0lhp1swXD_{(ko=1|db96$ zGKB(Hxb-ErPSo31LOR|op9^s~dUOomCN~$&Fzdoto6ftcVD6}|I73ZP}y_FuLRr_bU5g16Bj)$Fx;KLOA*~X=KXrY~cxmbGN2u0~H$o%{| zX+ZV3u_K8kq?Kpg=ack6;FthFIDtDi%(p9OU+Z#aOH;X%t83A^%s-);$p-94kk z875iwJkrPY+|p1?>~K#)i~d2ys=9&&*M5)GVY4?H*bW!#OK%>C?P6fUb9|mciK2Q^&Zj zp_t%oiFs%Ljw9fByp=WD-BW&{)tq5x?&6dY ztrMxP4>FHSuoF@~k#g2rYkkB(5~5TRt=(b3X~cfchL+sGsH++~vpBOhGikTlWhNV1 za??G*yS}qQ6=qh!II;s!eK;DccMd>)8k1_x&?l4~$kCvwH)xi_boOEg`A6>1V2Sj8 z(MwpK9j`1%>GQ6$pFhR8$VZIzGH;%H+0Yu5w<0Pvih+b5IG33XbV_j^SH~~^%dEmh zvnJ;H!mhAL+*zNf*YYP>gHBkB>b|VNFE<%+O?FIN8NPHqOGXrC5V%LuQ*xF~$Yf4$ zgVYrb!!Smhpuu#u+xGSz?jJ$^uRU=ZEoT9Z7T9ypG&j_aK>(0_nw80b!GLBjz?T1! 
zt$P7u%5F}`J+oE58H)I^@IeP@VPc*-Eofi7GXSmaDKi|jVfbE4_HjZe3Z(w{uuD&beX+eG6r1f@XlC~tS0dVBiyaLBsIjV{}=c_u3kwfv~AJkOgt z9E`W7JqXn2X1C_YVp*@hmtnRfE%l-pF+aLl9ecc-r8%jegF)uOu~w`g)G4ny#lu?P zX`Sa1lFS_&t<=05Q+06?DK2)B>eVHN$;yY`qjC2~t(#NJj@bB>hBb~~>t2W)9Xug- zqt89k%<>~>SlV~7HA~)lHgL?eFO7OMcyEAZ?{<}jBcq?|!7OvdtPLAuxY9+P3_a0a zy|guk^?7&FrNZb@l!@G>&B0516K_0eJgf{pa9EG>Rl3mnpD^;`sPsKTZKsttxq4`h zGGO!cLgR})7pxw1mJJxKZ)?|O4gT^4`qnvkpW#>v%g}~XsRJrAxW`$7vID;uy*Q`( z4`r)gp9-F=`nz%adAWlh?zk|JT`N|26@-Qf;u6V(C&wBVl7O5O(5d;uDF6RdOZ~ri zUlin9i&WICvo*pfQ>1Fb_qKl zC}^tPxKnQt!rVD&wf74E<-OiK&s_kXbhoR6sV{fPCCaVui39ts)Yks6(z)$l@JkvX z0N3t83(C^SdP<^{r2+8chjQ5Z;>po+M_(2D0T*8?JSy@8-jc9BkE26lXmy zW5Ib)@R|w&p`uICJgXjdy)^Pe&}U0Wc!m@3$!8&82@4*yT0`BuIN(5!aN3A&hjk8D#BxpPQz*!l5@ypPhm({q+`8q1yeyQ;zg%&-+but z(^npmBVZ57_5ynN-J_9aoHfF_nj}>k)1rbH{KnBmpAXALGQPleM6WJ-OkeF0-&n3a zl1Z6-HwRBe4CF>+rouC3Bqc@CD<7(B%b2j&5UF#t{WOrg%J*q2%A8o(9}o0rk~UorAOxdL_E z6uLrAu`LgY+rC3kUxyw;;S0e=t^l6schm1*ym~4@MFJuPaC_Y30gy7Y7U+~%$>yn4 zml-jnMFs!KNYBrd`iDW}uXy}_*KfS=2pN0tHg&N%aVVKR}K3k2kY@B*NH zuV!d5P2s*a`(x>ooAZSiXC2eJ%u}0**5XMZ6Q2ZAkX(}l-8pLCR4&jck>?v7&Js_~ zAhZKJp#^`{On`FJgn6?uS8!?;xE}GFh17pj!tm=Lq3j4q8_m3^aT7>?VtGT@)PA52 zgDymca{@LQR9*i$+y2ap1SVh-=y%l8<4M7WT`Q!qgX~Y){RRYS{c&cYM(K>eBg7Y^ z{d(^86T;wK-2&18MAr0WAJW*|o0Fh_qm(5Fui~~+WdHW$`R8Qj=kdRXxBaiIJD|ip z4d1s4R2R|fn+NDi+Xw*j7+K?m=gqDuRNyTR-2;TdzE9L=AX2d@dAN?!B*Pg8(zZ?s zUSc>Db}RCe?$+8@&ib>H2W!oommuGRw=kz0p9I(+C!YYqgL;yO;se&9_Xxew>fNng z>YKoe{ijo?NyQeqK?BRGFzxW>_Z^<6P|vyEHD*jA8_J%MmdOJw;Xg`?;$qY`3{nNN zR3qCD7^o%njq%R`W8#!2bHX1b)>Q`h-Zd>aWAoe>O$89= zZ&}ihF}ok(c|X7VZ(cj*liqCrB~it$ua2SX^ganD&NdRy&$2*kU+PL^jkwC$?mKQ< z87ICUPg=zJv@u1BcHs{Du?3dcbz$U+_7Wh;@q17G*<%AoBA4XxOBh;Dy2DZ`nvoLh z#0ki=q`AcDsZ1+;>x8OZ<|efk!erOKU)TVaig#>YRSBM`D8rh80wlK@vtYnAk?U43 zGu3d~{NI^Vj%HdEXGbd&5!9fL=G3UIQ#Xg!TGEqAX2R~#XFRBYl1hN2B)%5u!n;Hm z+jS*ChP6Vgh z((S{4DvKS36S`%($NJJ*z*l}cJZlZUYS?z7dc@z{T)@DPqC4? z5Y5`*2^rXLjs*Xz;STS*lTu;!AnFIG9Nr7AHKq&(3SM{QKHk=*a@D)=}^6IZ84=a%}~yB_$TC}!XQQT&6+)9E;jNgK9mpV++R~AB)Vd~pQ8yA-aPC~3rf&5bOY-8b%h@D!!GLK ziv>&0w1%ADicS|a0IX>?>>3Ua{sei+Ls7v4q3)**H7(MWQj`HdQLV2BFCHH&IGG18Rvjwf zWg-Jql>fqn|4+mDk1?D-I6puN5Lb)@*$YtbIP`4+d&yWw>ei<33t`LSi1|C6Gru`{ zz`4KlSqHQ~ZT8b0&*R-WO!NEBeF2c%g%5sHLG)XRCn1pF4G0DlXylVL)gUi$H8Mp! 
zy1QTaxTzPp<#@MsmHrMe!)*ezzgvzj4p!wvPBc5KgRP1BkF8(sq&)q#t$#Du1xbT~ z?+`mrtmjbGGfdxBm)|?&w-A_?WwMo|Yo5W-(Y}1UT!>=Z)T4I0hhP7fnB}WU{1Sv4 zL;4d~4+oq6nu&}Q!C{B(aLYwY%T~mBB3g3$1jPPmT~7tts*N0jK2U&b` zWxn-*et!0!wN@ZWNiv)fgPjvf&#V9YU= zJNf|e+>o>Vi*gOD0c;?$hPpa(#F;d{ktJlv^C<7Y2y8Y7*kofDQvauNIbiRxy{o_%nI+0(avkU9sj+@x=7f#o4Ny>VDulbFFMY zA-2y|>;J;z;n62`yz0lA}xFmJSf@J=uQI;&JSHq8Tbtt8n5vu3}2-=epTypbTu_ zUgxxqr2LKP^cP+Cr?`qhX@O;7IO3i>G+p1BLZv3H>k^~UYTFWg;x}B8Qm2rLp+GgI zcwab(Wq)EXl?g3ZcwVS8{m;`<{PM(ECB8{B(aI0QkvW6Sjh>l|AY7M8p==z*qI&2N8yl4PEHLn+H9sO;U~Ym$F4lJqS!b zV7zS)+=<7G(qKaL^=qDD=H)z?KY}_EodhZ>9$O8snV3UW1^~MD+ox|l78&1GNr|{m z^g-f)Nt&UQ)#0ACc;iw?da;Mq>!d#=BE{RM&L2UJ?zW(Om7v+5tyugo+~$AgImd}Q=hrh& z2p^Hzo!pk%l*-6<)qsQ_3CX(N$JP3x|v@v(=*eO(J6my{l+=U*xe*@`3%Q#R$8lI`tXk+e#~vRqm1WLpn9?& zGz{lUUKzTCM9H${#9CtVimfj)ACQF9Ca8I9b-SOc^geht&h&xDp%@R5f-8a0z}mZW`>HvKF!dcJxOvcGp4;Xa5<|DAV#2GC4`NJ-Sl}aod&5*&kwCXdxA3-%MJAnizp6)QZ$L8+(1I@0= ze6zG$O%9AGt$&F6QXG<=rx-?su!B)AiQB4&U8Qpq#wel{JCF`9luc)4Kzn!uOW8tKI~B7s8gFaz%{=51=Cjw|qCAGNT+30) z>2Q?7gB&w6yj2i8^eCqVD-$Oi)vcDEk(0MwwX!Gs5p?S#$b2n%N65nw7R52AbQWnb zj8=7`0H`^DM5fxxP3+>F3{J>QH_o4ItBj6x_A%9s$ zK{x1Rf4-l6Qlr@okm1$`*J%P0ivf1r2M^LLG-}R%1if;oxx%Mg8l!EN_ZL{?3MN1suzM;v@<(4&qvyh1cG zec-6drh!s8FXr630I!eERkNA0%OCWNl*qR=USNxa%{8_XlHO5*6BW{>iHH60MR%rS zJwWDBC0ixwxuUl^=9%dq2VeXMI;$Dgu47$?c2hctppL}B!&NnI( z72c?j2A@#$PE~+iUQbSCx`mU_c^FZosvuxwKb@MQZ3FDps!ZezQysRY(&(nv>&2i$ zbx2VJ48>qbbyBv19Q4!!BuzXp1+-6~CV0H8=+)k|87@Ok$orZSN+zh=gob71L5FgW=G{OO;z+h^*CkeslgU1)kJq#0I!3ZOiN-h3gjMY72$v!|WWH&UxD%;&2nV(wer`-_d5JzyV@r0_%TxYHs)&K&+`EOa{&(Hp6tLgmJ=X1$}UO+IJz1M)y zo#i9Q$HuP1afYPplzR&a!4m7Pc{HGBI`tNwBjt2yJK`qCPZ%}0Cu1QnE@Nd{!4fZdJm|raFW=!O2 zS1<3uE~h!Q&S5m$W1TD!sHv)=38}sl?WXu$VC{M0out(?x3s;uZco<4_#H9qOv;Y9 zGcQO13Y|*J*v)VSz`hEMGGVCNWi)mvWhib)|E(uI^yCd^O1VD*&_<4Bdfv5BEIpl| z1rh0_R!pNX@w!NLcZ;KD{k(AjBULnu(S{4&#JzV-)kj5}+z-6SQtNG)Vi2NQcUF{h zNW3&JxiO_$8MT6u;C$0kGJI^8IG*4hY&x+#En|hVaN(0%0;;1pE1tq0N+kAkM;r zH5*O>BUGD8Q;oYMJ*5|^0QtlmRUaj@6BVtQ&C6UBCz|b&IP;q-R@c&_fJq0E#_squ zNS<4pf~7qIrwJyX5k45GN+kRI2r(P0;4wYXEe>sdN% z*95dL;Q?9}uuw0yG46U=$yh`@X)r?N@r^97LE+HE0U{aV^o%Th!i>eHlu|_r@?Bcb zBP7h}D3&TH({p1&hwtxY3;Nx)#y*r|UW2mki z&W?rdYo{4m9_9B$=xNKC*qtt~JtBF>ZEtQhfUCOnnhyfe;VcTnny;9&8N)&|O0Dd+ z0R}L5g&6sdqD8G&TgJ(bzMPqBit)-o*UKXrr%5;p&!AzwU0Ymrd%oJfmsU@}cK~(e zo-?J4d2x#ruj5QC!wYk-&hCl>1vI$Su)sY$sm2b+fLLR(Nh>Kk5*SX_5Z1I$g%U_~o@}pOZX<5P zL+jE09X&ze$|jI}k1X=KFX;H0O1<)!g&Zy}W1-|I$)LatLnS~U%kHed;2gbc8X!fe z;M&TF$;Xh6HrPt}#K>Jcr{87=%w9`<#Vf*#y})pWgDK{9Z7v<4`Pb=u`BD0e{Zqu9 zkh&Lm>&J5f#7Zo#r;AKZ3&SZwzHIyj5VGKh1b`nYAMMZ^c1$qbFsq2(JQ%nQOLcNF5gS@WHLiPaK_j!Z1HvGOS zn5c>d#4A?=pntnb-vEY^{?qK^=l%Xo-Y5@P2jysF z+B0*Gn#pEn$S9sjdX?fo7L*huEBFk|OsK}uVdsCeUrncpPm%aYj(#Ir^F;cVGxkj| z)9QQdt`62H%iSn)X@P@?m5s~MK_7_?T64VZCiuFgH=*?Y4jub+4LT`-kKAwdE{DZRNDv~QrxpkQfjd zy}qQT{SiR{h7ARo6i8qjkV<|K{^6gK&K2!h<*%fkY_XsK;yLt;wER(l*c$TJImv&p zr#>^KD1aG^Er^RXFoRLB0?c5XF#mN11DHYn3n>2IK?HyP_J1VBkR-Lx%VWK*k#*-a znFhA0GcPXp3ZNoMS^+h>EDQ-)^>WWRic}Ol)09x?MX`Rqk#Pq}BnJg28zyFf4YjRs z1+TK#-+i_vi*#FeFUuvof|I78P2Lne2`qhI)D}Qy%cB~q9u^5^Z%EcTohJe-@zU0Q zXt7{qn)VPObHYx8{!04X(5}bGgUd2BeOaXav$@$vB{q#22R)`;TzdRwqbw#voHwHU z%fnZ5mWoFLm7OUUIGRz=PIswSV@o!*vaPW8k04#yP!;G+YXHg>BZ6=UA(_JMPr<}q zu{;9_&uVkXMeigRzVMQJ^#BH(~AJA}h5N zh-&(p;GJTbF=NyDt|(xSU_cg>WZ_Zij2PPT`Rdwh@>g-U!H^fwPiz1 zYs7_4CL$pI_&-=Qk|~SaIJJ7UjBM?50WXvM9WDW*{YCp*|HypN6UQZVd@$U^vAVf9 zQs+!h6TmhO?_DWVQc;7k^&5F>N-_N$75u?E|L7U~pZYF+7Ng*sYq8;`*v%1<_k`YB z9pjB=ysMd>fLdUO%3RN)INyam(%19`BbmTi=|uUq(lGFbB*j$Jq{)`{*rkC7Z{r{v z96}Ur={a_Z&%r=8X>lwTg~C`uURcI8@TE+pxE&yXYJ0p)Es)BlA@H46o|IzujR 
z|0>byZjza39HJ|2u?yIAV94s`_@oE+|0|i|5T$y3yVT-&HPZrqx{c<{y#~Jf5eP_uE{^QCaPgY0-|A` z_Q(P&Mm)6Wv5ouy8I-atGa@dTO#UY7gfGx(>uN{WCD?|vIn&6TJFZ44T(%(ZK&M`y zNYi^#IPgIB9|sJ-%te2m|L4ub{`Wq!{h<&s@IO+L`?wuCV!ND+M%W+ZenuO9e~;># zzm!}ML;x~zt*9p=(~aq4DA_VVwJ@Viq*q|tF_mfXFR*Wa@=5=MOZf+a#Q)LXE9>%6 zpUYBGPqA2`>MsECT_8~tFGk%UJi2d*v$&61^xSq@aLO?K6Wj%0 zXr~+WrN%$@qL2VY^qtQPd2TMM58CGv7s%Y+a!X4{W$OCrQh$yr{Hq~B8mEn_iso1F zIXEWD3ddzk z%nx7ikO9l%mnw_TA?iN}tAGn4@yMK)6?0i?!qN&pm-@Q!aMkN7lZ#R>nx9dh1QzUH zpt<}ZhyI=%`u}bGQjdZ8S_!#rpT(;P@n_8sP5In6=1$6SK`78q;rp0#^GcZm0g|`k zpV;3NPjLN0bCh5T~qgU|7fimWt{U;tH2|5{w_KlO89s#2mlbh*Y1 z5Htl{zztHn2mzFuyldpE{z)c@xhTrKviW$b$CWq zdD7fSgz=M7pfKbEEghWO?d2Rw^8@rfbHgz1Cy*~PG2ufbWk94#p&BFE^%UWD_?>aB zQ6I|!{V3Y~4^!3A9>u;3%!+Xn4o2y2xwQqQb%iC-!i0O5S#Lyd!=^ozjw|_&PADTz zccQG}iQA1_&5ty9C65tzV8Oy^sP5%*x$vZtWmgNuvhd z7Fg&C4lFr!Hmbn0+Ll*hPbYoIOu@s$Wo_jb`->t8xAgQe4j}Bt;ACG}BtTQ116Wh0 z1QHN1{;N5sq_nJ`g3+IGN6NjV$D9wQm^@`T%MfDEp6#@_#|~d^;s1zr_Ad%2dZIkd zhqx!D1#_HKB}^_rC8QZaxa$)SlXzpb6Lf@cKgo*7w%}a1BIE^d#`c}gOP;KJ#ZYc1 zpd*t_op{0>q@>O?;?a)NS;3M=6cgIabu!Y#*MP=tq*INRu33obBX{V%L)~o7cN3c} zSaVD}`>w)|xcwonJ#gODjs!#~T<5U8^(C31SVJ(gft@r}0=6j`I|PqF6f0aI?Ill{ zx}m-FhEruG=TW3eV%jHAD=#b|Xav7q=z)4Bkn#(7wv|~`c6P9>@eF5#fi=PrWhc1& zjtQ=>Wi#|FilaD3*h$ODomMTMHOKfuREQseC`s{@Frk3;RevrgzL3z%5sjcEx$4)S zKqkFj^b=!-j^W^$yG=FVyLP9x!3)ZHd93M+FXCPs7`~@INHt9h?p^w0DBS;LczMbb z42DTuP}7EdrNz|PW!aHeua5@PaN=oS(9R61cetTnxg*Tk+;f_%#NRR<79w_4ooL&O zz3BD=-iS`0)*(9z9*u(!6oNKCfz~eyUs1oVXBln0j4vl|G2tB)yPwP(*1*?22Yc%W z;GwjqK93{69}nFu@gP9n;m%lhjj|*|MY^Bi{Q~XlXVr5r(2D?sov>qhpqq8hG{ePM z+Q3Pg*q@cTR+$%Dy-OEoAhuqY#_wbFp%{8;O41BOeqcg(gzpjP7@J2p(yhBZ=H$Ju z;C!v2{P9qSgEfGInoLUheO};?{)bBU+)s26=qnF%7$a{~0$5)kmGB5w^bG@9*|jou zw_t-y$Gny-QSfpxcWCa@`$oHqs<{tij=LGCzM$Kz72Hqrx%`9r^84!Nw-iE>=q+S< zkar|m*}B)B`OX=3ocRR88|PEKU%!-P-fXwuBp{{r!P$6TzGV1@l;LRWfEc03%O9+o3m*x)@zVfuX(s_28y=v{3wdttmyti7flAWJf{ad!QZ0MhG3~cTdM0ARag3Yb zd1#l1KzzzZ$rGR>pz;HP$z0ka@3*fvkbT-*(}%)!DjruxSs?}UU8;(w1o6T98{5Y= z{`V)}z%6@+@6Mx1dU^viw@<&M^GP)6d)i4jka>Ha4AOFkqBB*6SV=jfU?>$~?!o_8Je3^7p*W8)V z*5*jFoC&GaNvN;7+z5vyA_E_92+d>FIWS#fL8Gb(_k@LCkMw$IPE(6~nAfVt;vK&W>-dGdie0gq{ww1D5iF;S7%$XPtX*c@JZj|G+AkPVyO3WM% zO83wJpJ7YjhO+^+|LBd0iICjyck)6ta* z&p%qNAmH?`UncOb%#usf-))iX58k*IXvb$7t4|%}nHz>dzjroxJ9pZ3I-_etBQblh zFPfW;G<-Em323Xu{zo$L$9p6oC?S!&Wf7a0ZDOM}J)S#r{`S*pjSw_-@3h{V{%)Y2 z#v?&_C|ns;(AdUwBh7cv;F(l53t>(sf^5$LkRVvUSP&AFQU0R_F;5aPO^K270ju~E zfyfH~=9u9ds9v>aMlNDlmQy+x&y#Ts-^%c=fmjR_Qity#5zUlqcmE8!hxxMv{J(a5 zGcXfD`%&kr{M&`>X(brCRX19@y8b=ak}iI=s&e-TVptI)GHP~ay$NMedNn84AeC`q zt8>1gwml@`qsK0UM129L^)~@42XDfQW*%mlE99p>&My$>Cn7}mII@a69$rOgczqbg zjQ<3(Y-(Jf+$s=wo0c7z%Q%GQqntAA&|-ELw2I=OKAnGg%>A5{WhEsN(b186;VciD zUgq)>Z_-7D8!JNr7krOkGa{g8N4bzXXQkWhuhA6Zl(5gm99KQnw=+r4j8Z&T4NK`( zJWaNTn-iUN&44sdTq26{3V!mrlIb{fNq&IP-3qQ*h2@Z@um) zZQ(=VXe>4*Xh7|1TH;knN8x861&G+ybDcd87TnQM7?((DS!kz8CR^OtQ41J=feOlg zZq9DTr_*;dW9c`3o!&6FQGeA_NsQ`t!%nVU;6r2~nr!(OA0Nx;KPMxsDQsXr1dcF3AM-8&xh^E>yoq`vQZEfvo+9 zOvXt?-S`B0!aZ>0NqdCBls;98@-3)d2Fm;vPD9+Hgactu+Jwg$m)t8!lz>ixseeh7 z{}r+S|H-vdE#xvUPDs__RI-s^Y<@swmqg}`o2?~|$aLZDVy!0E-$ipPS4z&v`!w4U zR9MAJVp1j&1np5CctPj-W}!h02}0MwqMdn*N8`@`DUkDP9dKc`Mpd@Awc&Z^QK^gA zP8}+Hc*$?`2Qob9)(<=EFP93q22(Xyd1}vI_rikJA3%G81ft|#^isR!B`ohd3c8US zO7(N=1g9`j+nkVpPG&@89>ByvQwr`SGVjx3pJ+7|H;u3x<%yRWS zQe6Z3vu`X9g7{ zyxc1f5U_V5L~X)0_ANpYiS^#+b+eaTwMqzL_ta;Bhc|Ero_+Cibe1M)eFQBspa)C= z5<3kRII(%M{1XU~{t1+F3B06;rVtWm2WXL!b61f{5VxRrzx?PQL85O)w}~dSYCTQ3 z;iDE;&M`|$P!U#dPDu~VTGD2O6c7ZLA6P^PqT26L*r5j4YlCUN8^n_p7dBNFTLrrki`pGm4ptQ6-JFSF2SdQ|V(rq}d;jPeK z`<2^j%=utp=8IW(0%={aWOnBFgzPgzA+DXoc|bJ8o9N;%UDb$l&c 
ze;t{trfL`I=e5e%yf#SAz9DAa1S z(fA1@XoV3f%*hGRuk@m**LM}VjqbG0lMX!Ib-qd;auBfU*)bKooM>qv$Z|(*7J=ZN zo!$!G3uvkAn?sve;+_{^J=i7QFGRXYm~Y;vLCPAQ2g+JxhRN#{zM1AbfzSq?<`Nl|Fl&aQD$c z(i&j!<#?F>#b*cor(UmmQ>|n-+|}6z!sR^{jz!p?mF8o9nAq3h>v7#?$n)Zuy!~Ms zPyn{TpkzI%fLAjlcX14M#LdQtejw*ir0UMI!+uaqIT7yJlu>eHC*1Ot9}|(IMkmt| zb95?{5oTz+E2nj%xa6{*+_L{}fUK$#&|4C&BSDZ7eDcdAC26|uhvDYUO@@#Hhj z?TanT6ZGRgSCtguHNq|inQHjEQe1&we#>uMt^b2c6l@hsKnRzlw6yVL>3h2xa%iu* z_^>lA{)|-!%@OlI_ap$e&>!M$!PCj40_S(sIhV^-FTpSI7o>|uo;$Z&gI-))a9xlW zEQsg8ygZ75ZHQB8s2gG-8U1X?ODGHTWj=m+!NUH<;2lCT6j$&{(pufP&YSs|V(Xa8 zK)LZJ(CM8;B!a2;`Bh|VR!J{gW=Wzq-s6x%|4x{#F17i&v=s)gN4{-2hUsn!y7_tN zis+oQ;QBROCb%GuNFVoNy%sd_6;{7C@!2E2lmo||tjNfhmmTTluc|o7mJND*>A@8o z){bMqZgt-#jkV^kk09562ZAtQ_=ZOa>1GA*Fm)#0jTAaZ8Q1yDRp#o_ygeWrT(|Up zcy)ix*O6MHmf|W>JaHcA(6Yq5E?xM-YcV$)<~Hj&1!V@-UutP2z1b3`H^ryj>olA1+W5AxEanWa=8J9oA3;z#G>SJv!>i;O+9VREcxj#P8 z^+tUO-ZHhdEJ9FhC4=TR$dwo|I$6+(@mJR=)Jl)7$N_91G5Da5zPULIGcD#Wj$5Q+ z&;4RnL+czZQAZ%&7h^9$S7wO)A0;PVaU@{0Ji{UYpno$Z+B?GzRW~%G+r>pF#6=supur#u1 zL$unrjpmj{Xdxs}^8f1s`hV>C)n3EQM)_!p?*(G`;-seFf`BG`<{#y$?I$W#!*Orm z6AQiTutL@Yq?dM_d`eQ^6oGQ4KnBs%3}_zaREl0i;fpo3?)v8U0+gh@nz5@9EF-r< zM{tHyE$oCAJNn~%txg@*adWnWPT-8^aYi531Y3ekl~kjL+|DV(8+U`|q>O`^^DmYH z-N=4tV9~!!i?IW%8b#QEQck43S*!m9I`{Fu&I(ea0Xc)dbMB$&4t)%YXBWZO=V398 zF9&;;(~4(;)*SI#l$>iuNh9>|Bc@renz`q9pu%v6LRqX@)n`44GzcCFg$en1Cd!TSKA z#GdaoKDUoWp(_#kOlrj^5X~eytCIh1G7(p$WB<6v-~GXWDmSy#70{%PCOjnD0r5@#8!txPT>8@OC8qWbTVv zpoi+powaK-w;{oQsihS&bxAH`F-X1K7`SexSjTFBwn%`RqCU#rF`N`s|5>JNA!6wx@sKU;eJpz>B|$c)h6Ffs;@TT{T2= z>~+$WMABXW82(NAnvBNZ98LAdodx`xf7Z^wGY@f+iYdiHJMSK7-8t(<@hnAU*Cj`bRM($=IPNSPU~3yLs<5(pcH=7vI0Cr>Ai8Rz zyHB&tGRh1{1o6vhPl)RL@Yz5&%#_KJG@mJRorsT{1o1`ck^&o#vIEh!<3zAw#$xMR zn>`6I1u8`tsfSat`F zi2(qObb%-T*hpBtRg^9p{TdFTP6kW$mnvHmpSWS$PYA;1b&K_wlPt#2=UqaJD~U#Q zs+O0p(4Td6K%W;v5qZV0>9IaD?#ikvls4=0$}WyI7_D{`4j%@97+XOFe`ZJjPrMH; z)0R%wGLkESa2w3o(|EFE{?z-p%U}Mh)z6@>iJH7!g(jtKZhoRX=>?X@XWOKgOhH%- z7`=V7IzVFz=!(h}jh#icUwE!CD?1P2d#bojFNC74ChyU&yjRJh7YjCGbdcRw`u{8b z^WPquS_Y!@@}@gC3m4@im}-K{F5)QTE8kUp0w8y;v7S_v6qkis&JGU>;%j~X^a4|k zK`u@hho!{*TG(M4aP67j$XC7;%;W3KddccJ$sF83C&i0%GE;icaX&zC1%OC;^EGdP zTg}|4kEY1j#o7t0c#dqLr^=ckRt=0TPfdLHlGKvx-4b8QzpBs{GE(w!ZKSEop3;oM zn~98EhIqkFZ9*gqH#t4tzW}nn#~tB5JD}hm)XhFh*$vOfR`oY=K5lwaW}^j5%7UTM zD?|4l9~s@^JCdZ|SA*uV^v=m`tFa^=B}NU1FO@SF@zse@y&D5)OPROr~0Oo-FEc>o!hJ{RJANios#_G6eHC zU0{BmgCEB~4u*U=K4AQM_rR%M)nmg~t2ELEU^NMw7KDfI%H2Bd{!UV_m-du0ND=al z2E2cSV<=00WE(GlpMPS7qI!~A8V&7%%4xlK#J{@v9=%d8M~o5$7tjFS*?XiJngI$E zpi)=$sD$uIjWg5msy=_%K&@ava0<0pe89VT0Yakki&)QUO=e!_w6PI1>qAUYV&(27 z-!8(Ha_xH>7F^BIG0DQnEq~S8IA@0RMp}8Yl$PY2RvkGS`^-LWOfc)#y|vsOz}-~- z*<%||)WT)#3l{!#o^l~f@Bmw!9A zxuzK)p549dL#dvo^AN{tTX*cHkbw!-YW8wo=aB|c2F>hN3(HL9FIp1~YAy45l#qr81vKg|jXgwO ztn%8p9ORs0&Sv3KrqZVDyV+8eeh}8TqbVe)o|W_dunVQMHHc!(ys6ADT+;$z21f+e zf8GQLt-Wh0MYbLA(bSV7`lZ<31YY-0QdsNUjV_dLU%3P#W%-KSjN}fR`CK1(V*=GA zBg?wWrfS0s!3@v<5EMd&izC81Z~w+EM3>@e-`d=)TbUEM%-!sM$u%XZ=VZq65H^q0 z)xYZ+{JTC6zDj!5r0_$;`4Kw7a@7IZD4cGf-)$=7#*I;*gPyPVsr}NT3Y=T7kW_)1 zZI{8~N8ktC7{=-Jk|LcU< z&uiv?;Ssa4&4s^Fej7Boq2AO~yQrEe7YY!~JS;x5rf1uqteS2YatdX>K<6VV{_UXA z*Q_kXj;j;VZo50#a+hm;cSBVL5elOARzcXQ4dJ@VJ5nYw&6G;7$SBp2}3{+(^d3{*9P1ul8(C;odM3FOx-GM5gBO&|Bs1VKNfi zhn?4G19TDm;ACh{nQ4KZ;Lw2Nl?OUY{LyShhwidc@aJh7`j4E_m?m*Ufm9b6w^zaf z)m^mXWXVW{Apf0PDw0fgETeWfR6wTZd3clnay!dUv}v;Q!~qLg7Ji+L_cOV_HXmP` zqkqFWO!mMRTy(WWPEAP{@L{Wc>g;+^ski5~Z$~ecE!fho#?Qdi&XHQIrBjb_eHjJ#EJsqlJTRXr3u? 
GIT binary patch (base85-encoded binary file data omitted)
zyN!04+q{Y=8YK-7#*CXvp9$x$ZToijO3U`9Tob~G_^L6gU%~zjWRDLQ?4W!}$A_=Qc?v5=9FC>3ai z2eR%2;u>1wi^#+&s#n#ljB7BB*XVW}Qo|eNOH_wk`fxX?m3^z~4QHQwrb}lpNRXSXKQ^${mPBdr z7meG#5k#MjD6gf84fBz*^Pg^LPGL@2!)TXv<}8ukQI&DQl$4Qecz7v#l^)FUsgPIO z%0Xx>CMPfWT-EVl9F6Y`!$^I|uX{J`A`5)ZdjnUm(Ln4=SRge)64c1a+)U9}6Gt4O zg8iX-5C^lcy696S1>sgfY&Djk$&aww%-0sp#nj)z!m83*d7^&GO3NxMS-kCMyZ|4( zSiUFfkisN7i0WsvHGj7h2v^LbW?zw}z)zTGD)gvwIx!Z2{nNi-=2@AX9e$Hf@Rfh5;fht| z9g9k#EYn*m?L+#I{li-(&lo1g9DB2Ej(^iltxkqpE~oEX2Cwfp6WR517tt@RR99(I zw|U$d?0q2%(udE((~p>d0RuNQYZuI8p@|jcHm`kC+7KCvINeZSO8La;J2)m%0euq1uy_gA3 z$z3hn;b+MW^Jl)&pq&vTLw?yVoZFHs+1dz`y0R>V8Vav%>UpjvR$B*c@j`dPI)RCe zC~;|lttD`|8rqd(v9&J2|6uRE!+lNHLIvmO>4v6vYG; z3D`hIil9bBN+<@z5E42n3W^;|Xc~b~L`6pdQABW5)UlrTiH_&YoO6EVy{`8=^Znr; z`wAin;mKa@zVEeG-uj9ww`=7Jj#V%>l((v8rmzhw-IXj?zO{Vl;pPs0TlfYN$sdno5EjvC00K+<<|$wL;C%r=At&~k zp0Ji20mU7YQ-p~Y;?G;VmdlL%-7NwHxeA^KeoFesHxR?T71P00+Hqd?d77=w%S6$^ z2DR%FysRlpHuY5$1-)t;o?N?WxEL*Wl6SQ#O@0ZNw>C-ZT>K;#!04u2LT93D3ztXzN7LIN7dAxnnTkul|&8A=@P%bO>+x zp_#<)h=0-6B+2LTYnNS!N`1$vh3Xot*Wejh5UQ~ZP?+Wz-yHuw>=OQ$#x?1N6y%7soA%xCMPmz-U)r} z(LwJfhum-@_67|nwt*o`A`*YQt*$v(uZ-o~maNL>39VHnd^5@!CjKP!?`q zKGi+nAfnhJWQ8E8`(2%SMMsr(a#Ps66sRG8xWTg9#(E+AF(`>-*H>E!ZV#}qpun2R zY;;#yp{uo9+@v(pa<-sxDPnPI5rUM4b?S*^WY90!$Bo#B0xh)Xwf7tlvN3wPnw^%ulB$ zGrof))xSF(vBUY-T%1F4ul(Un{#X7|q6s(J#jRT1y8Lr10eG8%Hosz$(r>EXs_|o+ zEn~0~2-1g`xN(Ojl0w=iQN*to7{Ln34xoGSy{FZiDMC1MG(E%^&b<(>x@?!$;Q4Il zuI9sUV?GXVqESMM~x_PHyBdRPLTydQ#Yir3y>eW}_%Wuf%mG2*>(i)EF+@1OI@@rco`TV-G(M8DO zbW(I$;z+}LiRjvw%$h}3Im4MJO7G5uQ>YqUwcF)_CbIMzFUGi5mr`PoQme8On>rQK zeGmEOz}qpW*WZe!9bn?rx2hf-#x`Y)=}QV1gQd}aTPDNq_EJdE{3e1&bdx}BI`QsP zo8S}fnGorY&6Z(aZ?u*srQ&jty}MA_arj0=dWZgl0h8ysc> zw`o3ilVq+7zYy8)<*s2|9z?hucdPaSr*7%0Ugo+0nrkZkYTUw+WWOd{(i9o-CW#D> zoduJBo}aU?UHjx!2)xO7Ky~HoZ)xfHVnl(WcgW+y+j|>J5sR4I6!rnbDy)$qOlD>r zsPwyQRN;m0H`jk(hm1^GWf|;c_T2L=|ER5(QW87K%<9)C6m2OqEj>O2nI~!rU~KI7 z)Vfq2Y6fd-MecbFTPtVx@0r_MBVFcL$jTU z!%a;({P+(0 z!cJv0dRo^4Slxt%xb01-jYn?`vucLsYVqAQk%gUv9VC$2O|FcSB|pe=&j=l7xBM2K z#H>|PiZi}jvM}JZfAWGZ_gR>2TG^7Ysp{T!b?#|e!!a;3M)gDg+w_0L>K`%t=i`6s zO#Z1>y$5j0U;c-7?%7^8s2at~Z^R!H1*!t&BXEJ=7Xkg*^$}(5VTe_Uud}Vs4RB?l zVxK9Q4C32~-4Yn4>nj++(S)BC7*2C}1~LXc~*r^}V9tCe$<=(jF@FmqiXf`N6SgS8T`7{-eK)M$G7z#%Qn&4=X8b(HBEHeuW#d>znVX9VpH{p zO!aCde?0GAisI6JO|CYhZq*g7%XD_h?QbIN)MVVKwM-`vQvzRHvh+256FGb!KU#b5 z_L>i_u`{FT{-)VolYHOC#O?N#mk|0=toV2QR6Tv?^?ZwS)hcXZ_ZBlF7%)1VDv#SE zR-1K$(%Qz3)=0eD(@`;Q%zUuTYF)JYS@^L^d2BVbZ#I1Z8$94_SX|*Y=eJjgGmp9QqtX_kxy9>oH(4meLA&gZUF<|q$)Y2U_ zS?${0b@Q(BCSvqj3^GE`-nj9u@e!DCcN}?ghIu3Djh%Sfy=rflltsULW7)Y4ZmRrE z@luz+P7H6|G_q@$qeE?@*IAkFziqdqf>q|D^33E$IapRNE$z}kvC=JjUNo^fu&oK# z5jvmRJ9xm(I5#a2l$ht>_=Wn<9S?6nYt zwo2OKW;_>KKFtxBE`6?uF0|OhzZ4z6)_9mBjDApN(|QdotRj+PdzM>yxK9P&v$C#; zGegBvzV6h#V)C}>aWm^IIx>dsk*O7UA@R4mg|6PJYgxxDO&27luuBL_HAX5LC~(Hn7x|Rb{b#TUu0auR46msyH*us$F3?Ct)UM=a8fp^T9j?f=dH6MOp7agaU#9 zb(m}?+E-_N8L_F8jMcFroKUTpKAArnJnHk>a!L^G^R@YO*sZoQTLsr!Z5wxzLT~D$ zt~Dc>6>FWWUw{hcioI(KaQot#=I8YnoPp_q--Lf0H+lchS!9jW*H-~Usj>FIFE#>c z{_eYPKo$Dm#kucwtCGKz&oy%y1Ce+8GH+26{a0D_o4U6)+_Bl<5$YaiySa2`Tvs#3 z_T4ollh1h<>S7KZyJk{syF`ZCueW$X4dGCFot^5ZA@xl!+{Vfzjd0IRY&Q<}E=H6O zB@J`2D-+Mm$K*QYvNH2u26*NL<`$W*l4;fk>({=?!NlMoaP#7;3I=2>I?30AmE5)mLt(cF&UX?~uJ?~bVglp;=XwnT5(DYWbqR6_Y-j~Ikmt?Qhs9Z(mQH{R! zudB~4FC^aWSxcTzddN#eG@Rj$lHOlBWXE_t@sTKe{i{Q*jqJr*)^e$R7z9{%I0Fi~ zC3Ni4v2P@g3& zBl$-f^43RaR+1welJUki+A(HAwX0aTNPk)A_>3(3e$3Uzd7%v%j>8X{-{?uc2BIO) z(G3zgOc~Qmf9C{x45GmUHyco*^2Jm+7jIKl@4p?^3Xb2PacV4sNidOzw>DjnIGA!n zwXz21BWW;h!*{)byX@sSXjasFDU8Jkx>j)4IA^#v8?P*XcDvSaEcfy8aBaW^ywbv! 
z@uPbty_dxq&6${zSej^46}N`-n^lG!3p5lq<@=l8I>OOcY^w2VlXS_VmcCjgpJcQ$ZL-nBdj)~Hv>2K*I=T&eF?=7z^M(AmgGpa6`+j6I zi$+eXxd?4f#=jWaXl4ip=Jbp94C}lZ8QxpY?mU&p%hVQbeCxJz_|;tvmoCdU4064y zZsr~K{)Os8yoe6`3zM>w`IKP$vw+lh3BO!^4n3k-6Bdw!%chIhPwVJBRP8fgxgmcf z#IZ(F^Zv3@ld)7QFvQz=@XA9lHo>cD z+t8n(zNxEwE>hmEIg)GqoAunD{5QJ+7`ts(-+zIKe24VDVRi&ZCbX*@g>2>SzrIN` zvm()ZJk0M_T8M|6+&R@6vZJysF5(dPh;D=uVv?6MeXp{Nb3gdNI-mVFkFkP`$vKJh zn(RJAM>-C#6`Wl#Skv06+>M4Z!CDUnNt7A(W{p?Sz8liB{^+%f4Z3PSD*LArRs38e zY6}#Wpj+NQLl;?%(m?dh|GkaVR|?J+voBsK4n!hIN$L3&p5{WUk)IfmJD z!J#ebC1vT2{bWsosZ+5;lk-;iYJhuTIm2Hswt}6yUKK1MxfQ6%kVw`}VxD^H5HsBR z+}pz@&o;S3Ms-aPB4a^PI%Q*UZ*6U>mWiGcFE)GLEzS~}Zl+K*z1C6)lOCombHA4P zGCXm2e`s^L3G+@;foXil+GR>OZEK3k`N6@kt2GkmA2jdWJB&F!CeeziHg;KQ46HE40F3viX|%{t2q$@dWDXC(s_MqCFUO=RYp+&gcm zalTSGpge%A8@l=G0O7rWB8ai6<852WMAf3Gt!uY+)jX_co~D028O(Zb6uwx+?q)j4 z2v1Cr2q?@K=sk*V4cB<9LX_Kd{%=sZ<~5tsJ63EBoweBA;Ls$0aA6J!A69IJWbWF) zrms3srsj9%ehuchodxE8;Fn&Psk_&s&nxF+HYzh_LeEw$ES#oJbvaCjgRbeS={5HICUrOU7b;twQkbiKyYH6iVQS(T zEVg!6Uu}&c!)Ps=z);5&-4^ZzwF_dI&h(<`aM~{y*6?G(Pu!Y+X@lK@I^@~x4)W;c z5a+oDy>+kEx9h70+iFY*d9UXz>_#)u|{>I%l_AEqk;RP}D92RH+y)Ng6cZLO-(iDQI#G^+7sVxv8Rz1juM ziUorHBidQ3T1pMnjW4YyzsaGHRlG9&WgRdRM|X6;47-adikNs^{B}`rn~|%R2_G&} zEY9P|Z5w*&TZJ*oxb`TvF{~)eJ-&#V?y)U8Zt}wIroOQwlbJa-u}|T(LvIRu=(iqM zMqM!05uHw|xX7ftP8RTco*1AdoQcp&Jk~#R5wg4~9lY4U$xuRMm&m-KO!_MaU<6nWsKKunb;TwCem^ zW3x};bTH5708a~1KF2%1)s~ZadxI0BlY+x{I0)=4NXSHbg3|!4ahAxFvp}Ph+~$Ye z9~f?lZj)A3*kDpFTc5MHe@l<^HK(Bc<{r`t)}5+_y-ntE=L9`?gS+{;_QrhcC$57% zCJx6)BV_3voa1N|pVO(UG1;MZbx|0a^QQw>t2^y`Z9FmSA2yU% zP`rsImGhj(PT!w&n8M**{OQ)@Cy50oFC>A#$8kR8MI5Mq9bLsbn4h)pUvvilxAiZP zcH)AoIA3!6gFh4dHH$yxp%9R)EbZBGwa#pEL$c?IxV@Ci?}zIw?4paESflfP$;qDz zdvEsUl`trKeOB%@uQ-ay72_tlnx|*Dy3GO&8^>$2{Cy9!=@u}Z zBg(iN93v{)M}277T)5rTp<1&TJIS?Kh+W!30r?x-ceACxDL)vdzZ zSU!%UeJeEbu7Pj~kwqkSv|3(2Tg(;xj&Z^yUypwLAHM@WP^zL4ty=f*KO_DQ8dbxP z$A9l&_`buVg+*e>^FSp2 zu}0h%Dee%)!qG=Tk(2{~CUs2o!Fd44i3V`a)hZY|*Gq+X0<8fFiZA0`*wB70t@+o0 ziu@D|d8eFMlJu~=6WMa`lwlL@(;NR$Xsi@C|Dd$;{_i>pu_oes9-`DjL?K`7%If6X z4@5t6WsMJ?SHO*Kt(AKd9L_#;Cr1=F?l0H|7on*+Cu|B2&<_CH$^n-Dk5~BdO8@%! 
z?=u{r5eD}w3gThJGk*I2o)7)-9pDtmM5NgMz^zS0)|s^Da?4*^mp2P-3yyx2%HkFj zgCpPqD^O7GgN=H2C-H6%5)%{MZfw(+!g2JgnQlh<~+!vonO%No6du(UmFbNw~ZauK9hX4 z;fc``H|u+ucQ{RkClmRnyw8|ipyHARW{yfvJm+}b4Mq>Xpww>fBtMDT65P!bhJC^u zd%*v+tn^b$V)FylWd2i8%_mIeZ+@t=d3$85in2PX*EBTBv?!}BGcyB14d(R9-X&Wl zRM+1q3ZPmBU+x)i=6^Cc_P|DqpQLJujZjOh*e0?m=d7n!vCIQk5Qr}IUOC4-CGeWf zD@+Qp8RM3?9S!9Bj*;aP?`khrgCFvAS9S96n5=Z*jYn167D-b}HsR$Sy+(VPKYEyz zWpBTPrQ&;4v2W~Hu4$5mz%=p%!BT+IB1#%0#OYbYZC9^7rd}7bawUZk@Q!Cwh^>^; zeg+dQ8F-+>N1$qy1%{^3(ONzq4watc78f6cGV%t}IODAriXr5JuFUBfyw<$h=9U~s|8C!J1{XP2uy6|cxa+` zl8JrbC0LN06iU2E3C#eKRX93_eDWF-O|O*2qB-P~5d@SNGDm((^kT#|ECutTncEhy zh*-E0{D;`j0V$UNKF2NH%gq!wDnv6ScOL?^)6@0CfAEe!yyVZve?*M`Z_ffWy!@fM zy%0B+YQ5(lFyNL)S0182p19Si@>%2KimnfrK^k)YTZZ`0IejGVSi*2_vCQ4r-kRTr z6L(*yYR0E_FCX~4VWp?`!Y98f#!+uL4R0-&IK711+T`F{(Gs}LU6OF~b@0Zj3PpNX zLQxqS$<3#js&$XC8@-Y%J4Qv~g4HxhR{`h1S1cMg=nE{^CW=bq$TUnOb)YZg?i?$x z%B5dT%87xYc!``hJ{Ke^S{Iw+`74cIrosw^FQEX~%PIvViS{!A9teSvWPOcR>^?~| zhtTK(vSrxYW%bb>sZoVwJ(FBGHE`cc%BP@HIG)zgp|Gl_dQIJ3^%L(@am67~IZT>+ zF4p&Rpp|B%QU5*u0dETFgdQKtf)PtQESc%qinws*!4p&nJppwnh;7G zj8aE90mB9Pg>*rhQi~iK3u#o@bMyL;Oht8W+5$4LWvEJ>*TpS_XOVs0u$-(AGF%>q z;vwazM3le@x+i}2!<^XMxf;I30wtPWy7zFJ0t4@Z9|lGp&4gryNG3>T49&nScMTp) zM84znd!Znh1%f^acmqk00!aL<{r>uz_utiiW&h}S{x7-Df0oIi)84LF+O%qU89MdY zc$m+7MLInZ)mpyL`tY+2E8PuGONR7PMQ8Gm(q5+O7G{NH*{?CTP54WqDb1~0+QZMx zkNMoNV6?y@A-Vddwd14a&S8#040EYcpabDa@wx6NSe>{zQjibB&1Plsp)BeN)6B}88TI-Y1LAyjqcpRodzn3= zrZMSHtt*+9LN$&05>j~qgCyLNW1$#Z;QkD4w>AcD%!{KNyvnm0v5C?SOC>8tZ+bgc zk<*pP>u=L6bnQP(-M?HzNLjzl!IQ{4ni>xd=Hmhn6A0TEAK*MPd0UIFF?d?be;JH>EHruJr$xELhP*#u14{}+%Gd3f z34Ah5G0Q%yr%fLL#2!GSA2$(ld3 ziWE25oPMm~(p=s*Ld8w14}mqOqZSYy`+vclgQC;m6Ywbc>7}I@3ponjog>9zyAx7~ zme%47k4eIN(GY%y;7hjH%kYqMyecMu0~Mv2ZW2xElT1W4L<6-eij#ze(7YKedde+( zj7e2onk9ac4-N%G)dj^lv1tpL6afq|m;|uP0VM?WiUKfvj>ILH+=pPJ!B>0`vvj_t z=0C+_;(X^XyZ%jTe&v5F9uwzoKZ;c{1u^{|SF?28Z6y;+pSB?*pOw{=9CGp@=3zz^ zqvn=Y)qA!bJNCI9>6Y3O_5O>KWo@GPs?udEl7&*;uyF7~?WFvCC9HAjZhBem18=;#;B&mI~t}SB;m)|aVL26^~x*E0p7*#^k-5qmW;z+l^XdLfzEY4gF zc1jVwjf&=SF zxkbx%qY$|X7EVK9_VIC}$F{r*(pq0Q%zuGdN#ncFO|5{Ws31>q)S^sqBuA$$*vf}N^i`B#E9%;F zWTFdUPCq+k9s#7j*D6p3O1++;+N8?pahHz5{ba41I%B{ zYrvr1_H^S&Rbj|PgaCYO@=Wj}^P6iy>*2GH?c!qm=k?c|!SUte3wp25cz=0zWiIXx zJz{-_&IjXRmuW0JpKvX zpg`bjE*U6G8(%iL**UV7<1^g({ef^@Or$nvQW0gdBvkSsE}FO7hvD?(<5X4y-p=`8CtVZo!1_d=jKg4C4$N^~M@64`5 z=f~4j$Mv6+S1tLpr?T8tisN3=ypph@8 zc%Pz(y~oMf3)>XzYr6JZw>LS!8=u)X-Ywe@=(S5=KXRh|tFl+fCvMfoS)tT<&A|*0 zl|D-1KJMY3su-Z>-{v;uwouB$^%?)JhR&AM!2PaXn2}QsJV|Er_+GGp#>C3HB{dLr zeLdgU;OM2DB6Y5PXP{0ID#QAA{|9pVZAIKs2EMtHYw-11wQ%94PgSoQA{7D?1MA{! 
zb2?j=?GQLRX&)+aP6_HAZ%ipBIBQb8Seud`# z@lCeJ4PLlwIXsUeN&6$lDyk;7jEW$rYXT#dMm?qc9){8#Vb zAU3}kqo>+L_eDVMWhhzIV3h@{f+V=C`zXx@ZR91xU3|B z4gqo($TY#{Ces!G_oeWi!8!bs!TBp$tFg;g3ZGqo#tqDu1sb&Xrbd*DKTHP|T_0Zv0%`l5?WU@HlTHj}o^{ zRbD}pd?@?q4Z!wTWiY=hKEC+|wm+7CRp)7DSj?BOCO_k^&a;+NUlhai5w{uP!8hh5 zItJTZK&}5}D!6HONFPkRXP-AR@5s7U8P(m`z21xHl|%{Hc6bRic4{K3oVj$dZsEBN zDN82VzU$A`mtED$r6W|=mL!x;yS@J6M>5KJ<)*oqwZ&O8v554(!9q0m(oJ*SfHON` z-HQ#Vx12L5jT3h&YnI+iw#ng@zovgGjVUDDkso&1a>q26iz!4v!rCB?m zxrt|RoJwJhmB^?Dtv5la>aEfuZE><{KWJJ={5W{fGODZgz{J3UmbViFGK2#2WRBG4 z6u!Bg<~ieA_A`O}M`D1iv=_tTmW36=b>e~-uGMBwIYU)Hs5eX+l^g>@k8DnOVY=Fc8?mN3U(@mC%9vm1JClHn( z8|M%KZY~6v7Ybg&iUny1nA}hak(gBfL5OrO-Nyrg5*H3=j1~aZAO#{Aw<85_j zaUg(xyG^@Qfj@0GEqdp>b+v<>GKu> zgPmSl(e;_>xr0^6CMzGZW6}HC%vZTX8Uh*hhDoCv(?(9Bc$xA+t(~Hr?Lw`qJ%rnB z?PGb$54mF{62rMsBdJ(a6p3*!N6sPR#@#hkziU-cO8*=t$uczcbD#fWb}lPvP&66S zY|^HU_CyR0I;V)11W=wUa@Bm72k0m&ZvPeuo%vNB}IQ5RsZ-=7pPdGJwRQ znUT^=DdQA3GYw863eyhe-Xinn1;E@4Gzzh^znm!SAOJ(836by|56D}Pq9wFxFB&8c{GihRo+Gq<({>Q)Snbnp^Uz6nFBBq4 zu(S#mCm>{vcQ)by6~}@BefF(t`T6*tKL@ZpX|2Y$h<-^N(PuUs{H=rW>w6NIIkxYk@Csk;lrVK$hCplR!<7flD!= z=607*8hl;mJWQyhGnsOdc9q+1Or7sb$tWi;pq*5J0sfRiKH2e(PwC?;zu`= zAH{$jH%6%&$#>lI9*bBz$=hmbjPIqXk=j27;k|q2r<_HNJJgfU?5NwR7;no?>Ca^Q zzb#u9K_-9>y|E{g~O%%EUiHHGF8n@+&0gD%oES~Uc4 zUpWX3((QTHY&06X^ngze0F|`>&J1d4DGnGIhe#oTHNV=mV4|l2Ad4Rsr^|W)O`&9! zSQP>$A4#XF6=Z@NgSZ0>vL;6}K|Ex&WHTW}u}cu=`0zGvy4$rJa=+VNSo|O0Ljh5j z2;QM4sNq2>oelN7b4x+0#j^kvIAs9|r0esWX+JS9F8CP#$_v!oy1myQ_CdM!dBI&*n_zJSRfu8| z3N@;W=IR)lk*-f@^j{mO{u8^}86Pw`i2Kxgse`At_TIYwIj%s(#9(C(n-$*dEO#bf zO1;50zaWtj{A4X51T7z7tS|X#a3Qt!<;USC?Lmk6>uzn!e9PBJQX&YVHnu(L+Y~Zf z-r0sJXLG%FACXB}bLzOa4Te!5uOjdYFovOlIN6Rp)HTgdD|L=KtE7`VIzQ8ph?vfY z8Xv{Gyq(vo(Z;(DEl#tLN#WuBX+&e~xFckH0o`$8j9XwaA1;r_ydVeJM58_~ zFbO2FOPoIuxp{Vpr@gLu%;O`Qy2jZ}<#W}#HA|}pSPWmOnAhb)cyH&RQ#Z~e0$b<6~XWVKu{aex$4b8==F@U0H+~fht!~wh9 zu)1n#bX2v(X@Q|Osv1f10%S5k+C+N+H4H0&1ll-75{OAayfibSLEH>F4T1J15KsuL z_LLO30Wt;B%t&oiF97n>;5RsQC*L+tCI#p=AeJ%8e z1SFKu?=K7*+ev@~17sNl7%(ul5^H8D1&}(KkwGxUWy|hJaqji=`fJW0MB~-o@yB1Q zzRs=rXu09^p9k~>%lrEM5f&fGeXF8GC;hs+ZXzY*mnnl?9XE87urB*BJ;DYU_se>7>{{}VaZm(rqEoL?vMbdY@@0FS3NpWgwd;<@5uKpaPdh~TNocwZ>30?G( zMqf`C)$ZnZ@|WM6=-u3tUees9&tzL|M{cm5NtEbVVUqF?Kl6bc_c)2BoHNeQmezm%-G@G%f|m-XQ0^(N=>OQh^qmT#V}Ae`39?{W1M+U`|i+;3XxhwD_> zUB4*mx1mZ4i9+@UMCP=6jEF<`3c%dJHFVJQTWuWz-#QPLWzOeAmS3OM3c@~w`6~w9 z_NO5jWR7HP)kjL2Jx8)MGG|MD!4kiSgl$6(GDg+miV8xzOXhUNwKwHRcsEW=`;?yx zZ3^dSd&5i;RLs$_Oe)A5aztYVphBkgif!)u)Y*=2SuYnr>fi(7S#bJ&KyR}9w}})$ zT~=Jk2SO8H5b04cD;8t|Iif!CH;9UmG%E>fOn}I!DvT!NiIo-ye!zjrtsX4p9!mR> znaM?vyy*Ku!2gyPh@F90iwfQY@FLOTuhf_T3;@IcF3OvVs@4_2fYtu4GxPqbGy7h3 ziI3+$fBhMN7wgRa)NlGPLbLKixh?lhU#c{}F}RT$*tv$F{A#!&SwpqDVC4&UpMZ7M z!G57FAZrk7sm1jd1#x{g)uyV(H)D8hOm?SEeq(>nSW#k)4ZT39Z`^~U@ed;8~U@#f8Ry3)D)&<~Cs|@CxYYSQiZQrgnVB*a7WrK41S|$lePe>V< zlwU}U(`WnBlVs2~M$dB!@rB+NAMhxFccGV`d1SK@`qjkR%uM3i{z5<8vamN16)iIH zlN<%CMtsYMtRDZ@mNMl*y9(XA;dAYih1)fQrKOUECSC3ln7y-byMwCP-DgsRUERIX zd0Y8VI({_xhSF_xe+9#@ro*WZp{tuRkyy#QpAHx)whkr}9yQbr(` z2OKliQV1=D+r~2SY>4CzRAJSK3eX88&{b&ym{%X-VJ3j!5Ok36|3Ig}v71Sk$^spG zde-l)Zn_vbae@x&6HqOKr*kA)wQKp>KrEtnw4DbE8V(53W`^R;1fc&GWU3I)qUw_N z6n(!hndw2}$dK>?_{Q#oVo|8^FnHDf{7w4vcYZ(f|65}Ps!-53gzg&oNTF3a%_}=0 zD!n=7;X1#S;O3lLbR3v$2)E)UY1l86U)%D1PPwlHFunASokUCjYbr)L>}^u!h?>|% z8ihWEKIuN`*wTnik^>);n495H{GskDqoV>SeKQkvYy7X5*{$at5MR z*jDs*CK;i@k36@8?#_QiM5!OCasYdVAS-$zjfa82m{a{P<9=1}8Xu`FnDZL-RTDey zwrD{GSn#tv#aa-*);^k_lSHf=tKj9%L7%ceFap;;dK6URFda1UJZ{~|er3#7j526l zwoj9RSx2YZ8E%SP)@z!6jtGGQdX07x4(*Leb3-$HM1`1l5VwFe$B`7mN-aDx1j~_K 
z&{7t1o<8>q;WR3+Q}`7QHH~*u0zoj3h@51VGYL-Lnti^R27<w!Fa2sd`(X_-a z8Ehwjpov4Kdt=0|u8oHz(ivEb1)x>wA2wnREDo6dx1yN17eq#jMb2arP=|2y`e@?( z*-hTZuL>iT!9+9M5R*)IkanYY0>le|Rcs*H1n=u-yWa0`gBWu73oanpy!p@LYW_Ep zO(G)T(5tIczFhV;m5^l%=IirU-ke&wIWtql(+BnkLgq$ytV3~Gm`(I7xG^NOTqjKg zJj8hZeJ=CFXVr$aSDwj8yxjBl`S>}7-q+nzq6+;^RbI0RK#eLNE!UN6t={{bjI(l? zIu!ruyiU)Vy>U#0))4;kE1_()VrZ|$Wecov-fxfbpH1#2;S3^o)I^>m-oNZR^||h{ zXNpeGiR44NNBm>amr>c;$0ou2Id?^RE# zk7LSmli|iiR>3N{tjF>Tyi9{qD(q{Rwi-E`j(w_mUCCYQ6?pjC*`g$Yw=IK-=g>=~ z5>m)%O46qI8P5toN7I#_g?Fa?yX1^U)0FHn*X5TCoiLe=2$V{=LP)WU%elLBUBsYG zmHA{u46`xnZ5`u>@ox{bn>)PHj|c|M*i3w#s9SV)~G?zw2$ z!Xa1C)9yI|J_!Ou6bPTvh2m^SIvvfAd@Z)#>t4C0MmAC%!ELdB$2CS|6XA$?{9|u^Gp_;1kOwkc6=7rI5f;ZoSsoYG! zHJO*5Szxx9?kQ&{N0sJgsW%KKURFhXz*2(MCtEghM!9ZS>M6s~GxJ7W>gN!+F68Nv zvBvuj@@Jg~{b+%!c_rcQ>eFGTtcsm31l~!FKJ#KCNUdbjfLUv*w|2Jk^QEbMO&Ed; z>P?7?sRzCiYBoCB_Ef&THcOP5*OKvg`&y=(-_c*F@B&=?+eP;Rm#`jh*;mVv?Y_sj zmx2EnX!K`O0`B8G)wh z40#+^s*ldU=rj!8*+-e2^^b8NUY;l^-5Qv|Dj$7#52V^eV zD89^a9%!@zyp;mzN(R71dJUO*0%&P-tBVpW@1xFW>`Ux>kU@g5ij6o>S% zGz$QCpml5Y;1RHMfJo!RPzDQB_21>FTqfaXzs3)9`zs$!5B}B(g8wJLRi)|xuIk>O zH`FyqeSH(~+2@VLjlV(CsfS;?CNQ$A2PU3MA6c{A@H=<)W5|&>Clt3Cb7b}?ZlD(+ zXxE+7zuFXd-wenOK5TC<-4ySGdFWX#I+y`Mrt@fXwMK3dHA!!oP}W@Iw=O|Q;{4%; z1MIvmKl8O}G2NWz+V}mJ0nNN)+UITU^?aWz_t_44VTUoiM}b)3;%AfY(UAdh^T8&z z4aQB2yD=Z~8atO<6zxXy_jBxIFvs^redy+Gjsu<21iOm4B;Av5w`)lRM_$~rbd6L> zknY@tx`=Jd@d=GPrf^2oYIV}Zpd`JG;l=Cj3j(c{aM?n(|22LWUwJH8JB~J~tr#>- zD1Cs(XX4rnjP+ZMQ{b(fV4nQp+vN^C0K)YlmbN473GOjqibf{8{<9{apRJLFd>w*i znA*}QEUf9NKpx8@4E93xHr71WWN~ zb;t}b)oDny)N%|9TGA4oI|tgW08J$=PM|?|WeZutO%4mF&c`AO#=cru`I+h!5p2zW%*v zdvF{8_qi7x+Roz}F(FSMJQg6lT0jDVNa<(#vxtb{VzG9+g$TGEF5oTU?Ho}$C`&W_ zJJ2<~BPeNM?UjHrabv_)*;%USorFObPL!3O(o2E(0M zC1>JB+;+_PX{k)%1(pw33~YFT_R|S|?tQH*s#-b6*9PP6o(~)|Bw7z2yS?qi19 zmIe~8c-AsVb_%CF&zaILKSgj8IL$jAW$^iuiJeDHez04liWBG$Ghx)Dz+dZjlxbrQF$u}#{*4|@Iw^_lAE@kQJNQ(6KJ++VFmY~)U6kq{u_M5y+$)opG?*_nY&F| zTO@fY(#isRN&cVowncgCCTWx!ydtiv$XfBKvJ2iC){%poqD5Y zg#{3u#76dKmbo^fa}i@(nK zg8r52bXM42#d2WowMDntG%b_AfbcT0%?jbv++-waWV7BiBz9P~*FOx8<#${st2^-B zw(mB<9ymnMT^EsyuUx5>u7t0Nqy*dc>K)Aq-m3&ZVmxHq!{>w+3q6ZE;u_0Ug4Y~3 zX~~?dE55$Jhd}9siwZGnIem&t#oKAcWGymnp}7A7auijxu(nqtya`kfpuGebGX&-y zeDn7JNEX9dEIChKNQa6>)1l0!w7CwW{h5hEaAL_y>i#b; z0t`eS4jsD~=O|;dXkC~Zmt}c9@PgYjhk6&2q0@!k4mP}XyXVE-IOV0CJUtVr9@R`; z+}x2|zgV!dF<4(2nKtTQ>&pZ01E-{-2!;;#Gkx3uvF-;x8r2h>%kQ%H0`y}cYEu(>rXv;MzW4n6R?!ZJ2Owkjw zx?XuH;+YnxCrZ)Inig>$Xw(z=91b!?E>+55o^;lNb4x7UyX%;lUDfHmgqj7Dns5bl zJX;X7Y*6L4shXa?gRV+Qy^vxZBkYVQVF+rGYlLbZZ>SQenqTd!TM8_wfD-*A)>PpzR^N_iz8GQe^1#+3W zU`Lok-8#Khmy5JGvE}RJ?>osZq?Lorf~sPEbBk;3R(+^vRWex08rnSIqoV1`M7% z4e5o2m6*Qt&9{*9$0-tf;`Yi}phlBCCUPB#USKvtvWAUL%1(*oxPP;lQOVD?*@MwD*-frXg|2!!=6JHDYc}b_*Iy8!Z$L_ zXRyyvELx-B-GE@t8mqgoIE{vZSS@5t$eMr>pw;1aUy`Vt-UddcP(86r4|;(-T>{fm z%KtqvbtIJ!Kmtidun2B{P(E-(U%UQ;>?CT86*SX4|8cc?-y5cbL}@+UGGv=BF~Ola zU1P--QkRy%ucoh`d0uYIJv3~d`m8E1Lf}#(%F!X5ufK2@nbzbd^69La0vk zGU2s7Xm77$G~-93K$nuV7z>CDI~{S(CPu@C{GilUt*LtFaR12d@S@oR%MepR!hr4X zvrR6oce$B+^&XQ-$j?IY1R5Oq;=e)L3j+#u7n(S2K1c4_vu(rH=CV;vN*~RI)~G1S zpYn1LyH}x4eCn>MshuI~4Lv25S0;f5Ihz)zNm4vm{1HM|R%cjlB= zr(*63YRkukH-&nc0&2#L$KxwUV~%V$Cb4t6Gl%O0F5P6>mEErII(F0>mem@=-G*J& z%Z^O@h0!!G;T0NJ?C&$${8BUL(R*ICWOazPx=41hOPu9lYDAx&C+4V~rrP?>hqbxZ z!zJ5xqcl*(zF(fNv2{yo16$pfB;hZxCH>6}R|p8^JZ(ic9B&+>a5BDZ+jw{7%cISL z;ITr8jQpIEF%eG*(Q9KyUF{qkR<+-CF_Y3J+?OYpj6JD$D0OStLj{qm~<=wqMIdj91o;OaVVj#91y!fL`OjTDWTB}92$yF2Rs7I zYMT@fUor&1fr#kXC)vGJrjV>bQ4Mz%sf6@KC;NDLaEck1*cfYslgz>=Ww{i%xHtO$IrW*%QdSVhuX^Yk*%41 z2lSkZo+Skd!_P$=_l~QNTi-TjTg;Sl_YhBzqn_;;U~Unt4V{KrXwvx}47Hkg%^k~mACCR<_srCW 
zHTWz&rbsrvCQch)oM4rB^J1~b5-AF56dfczppx$vKf=)|2XP%pY^m~FI?Wwn6L4c z=kUJzjgN4VynG=6uMhWFgOSO1U0Xo0F^oCFeW$u_iJuJL4x>UCKS{262(uS(AGx8^0UB8k0`E*uFkqwsu`n z`CHw(L#q*<2-rLX<0n3S+f1kfD;-b2LiqaIb`Mt@BTio^Xv|2Txb%cPb-+Umc? zwY*X_$StyeT0U?4ir|r}Zl2-bG(8 z1QSO5qL&u+q>X+&cX(Z|zsZObELEAXAWOX}eilF0lC#lsJM_Hyse=u!0$*no87{%8 zZE$cZQH|sl7x1Foz#h^xYcNft^yi$KzrfJ{*};Mi0n20i&X4{SO(B=9UvWJkf1hQ*ul&y@{bx4MioO&C%!-~w z-3t7S0^eyZ+%8YpWU!*0_C&tO+WT?vBQq4dEQy`Lug*St z?WoQ}I6aBnQQi;FDi%wuZ<5hIWGC~mC}pncmOxc6k$$uu*PQ95P?{+SVhE{)=WIxa zjjBCYu4khCbhU!L(#6W7mT%)}Zt#UmflyE+0#GbKOR9k=kUxc!#BiBXK&UhyKEjdF z1ZpEnqy`Ve&BP&d`T&=~BBujQ0!Mt*>Hmkl_YQ|E?)tqqqeltRO9+C|d(>#rBYFuE zy$sO_5>Z29qKz6N>gW+fPxM}+3xnt-5rQB@dw+7@=bYzx&N5=pQTKjg(C`r3E(0Cs)nvI(*|uZp^TSN)=SHj&2L)a%6Feh8&yO{&0tT_ z>Z(mgd!8Zs3VPE6v4Jz=(*Z?-6x~kc-fOUMCAu$!m!nv1mC3`vz9yuT{ zJFvUbxHeTT84X#(SeRs_f;?s-UL2rT40L)TiyO=YasK# zTLXc<@ec{?|Nncc&Qoi;qi3Mm{~xV^JW)rjKyST(bccTDfCcB(ga5kVq!%UVZWY^S@+nvF8)2IZ)T zdIusJU9gqw!f!+a`_1FjN?5_NZv_Uh)UKAkkwzWb#X^?W?+4!G+;D}(ER}M@AL5E> z(-vW;2UKfCvKSKOS5CjQ*0NwJy!jEBjjRzJckP!q?@?HK+~tW$AEjdPFkrP1+`acz zTO$mg>01vvL)a0!r8lpacD+EnBTV)XG+L{D}HVb?{pb>rc| z3-rn2OWDyB3*^W$!6ByNZe1NU|Du>x;pS;nEGs&Oc1hBZu7I@adrhxegl@v1VsA|; zqkUF7@yXcTy9KP<9JzZDPYl+JmQWF-%0dLJ3a2t-HN_VY30%fLtve5rELT;~^+wMk zy3_TcJoAji7kQGZj5^qAfl8Z(WTy3;bHyJZDT5xzs4;|gBa^eZiJsn4Jm>&32W1tm z9Du9^@(x6qH4IvQ+nK(B`tB@9g0g@TE2Jqe8vJlPksdHQ0EqL%9$L5O zlPFK-`K79;6>7mP3MW2ZL28I->Iwa20Dub-SXh;L-9?P3aRHE2gD>(ERk_yPsm(1eL&|MT zw=ScvGJJdI^xHJi@?G_GJq90%)*>vN9RTJ$lV^^mR`6^n7fvTtmRA@O^Q-s_T1o6a zj@NPMB_U~ZN@*3=N|IQXvb#DtJ?cS49Bus%05?4s7Re>f^PgCfcmE8t{`E@Czgh|U zSG+#Jk_7e2jtBj7^j0-ksQ_30?@~o`9%cHy z@;|J=q$Hb~C`_udvEVy()TWBEsxo6+1s5CR#EHaA|6EwC>EJmLrpn_iozx$?fK01z zh{6+`ArGTR>^N~9fnte26XG53Piyz76?=TFfKR7Ad4mIGLw}6S`usLXD0_v7he69H{Cdb4W>Uh+>rsWY6?lj+6P)vEfyE zsVT7L_db#xEL@GQh@cYCMGaa;*f9!G$ZzgeQe_q<{A0|J?_GZjvCt#Z_nd}b z|CVq?yjz~k_TQ88(xH*{L`bp|@iq}y!tdHH6-JuL-s5&$m+aCaQV%n_DMTKxES_PW zRM^c?s1(7iyx^f5Q+4`>2#V2DOncXnR?W#;tln;&qx5pD7kILP{~Kpl8An&XGk@%5 z{-`C_rZN*)q!1!NwW5v!a(8e9FkoZHfQb$O3ZwwQ9}xT=7buw$iGUgnsKQ*jz4GHh zTa&KFlTmA+PHBL)W*}x^hX5t}3Xrpvus~@em`GX&SK@{&E+K@$y&IQ&SfnQg!@+h+ zq)kT=dx&I223_RB@!vmcaF>anH9RXIuDZuqd0wK7N*mm=(Z)%hO`{79iSb_;*~$Y6yD8F2cWwgIFV63>-DGK zV3vB<)GMYuP(5AC$78O*cfkb*xMNd%)_y)U<{?>LLWZh2?*waFcaIQBZCB*&PAB|0U?K*`bs{S#9@2 zy11A2UWC+_`MpdbPwd>uGy`M;DT&ajG(p1(dtqVetc;T0ZvA!xm8bD zgf$6243jg+#Zf9`lBnVMOFK{bWAr#^uu@;6mtRZ$7&S6L&D8Ll@>8^k4_)2>HFLvA z!*I5Sb@~~0smAu$ZuJ`wHCehCQSqoNgwYaG_TzqqyyAaX&;8q@cP`#vXBo$ z(ccFWx25z_2@8LzIIdY)nWd3w+#N|Xrqy4TlgQW4m2h%)2jI~rM@s&C&$y`S;}-P* z4@7q?|X3L0_I? zF4$>YLL*aNd>}Gg_=8dxgG5Def#ociazxch1JdE1)eIIf)cg2A?TQrZL4Qzv%fF!m+y9DCx8O`YY0Q^rhD>)LyU z1in|avkm*j{DNI&u}Q^I1^iA0_+v8d5Wu_u( zDw2h~z)o#+fG&Fax6k2n&4WKqGY?43Q?_S>AxnssT$U0LMT&xy8D5LO2xj!jB3Od~ zT3W1_tGOkXp)J#QyYdk_QTwD=wEbeZU0<;1lc=kn3QNf=9T6e_JT? 
zFKynSpsRz8Qoj^>LX-b#``!f16>v`KB6TFte}!yNI0n5W0D}PCr1Z@RW;f#bT5zqf z({K>4NIhHB;yzU!KBW+Bc+#(@z?H4hSW+hM168?N{_&02KBzF>eBqXY5u1|2T>@Zmm8y$p?<}k%3#BFsTW8XDP30qr>u}N!i`U+(_mLaYT8^eMnqV%4B1-)_ z>^Jh4*_}1;$uj{+gf#HIXW>%G*1#y7C~?xNDvm9$eRf*-n82ad?YLf4to=LsflX9$ zF|nt5fzTZdt}G)v&_-$xvYAWO`O^RLZ|=jt%=>bwJR9@v-ipPuc=nIQAZ@jY;{xhQ zd3F$c0V$dO_rY0gM|%4XLtpAwLhdI>(o&W9B;5BRPCL7jz1OHFsNRddJ-#Q%ef!|e za21tRLVwf?#*8GQsI*GYHbs#sLuJAbWjp$aKB+N}`yWCt8zIqPEC%%Jyll7BgiUU` z*ub{7cJjlM_dGdcl+Pjrr@ozOJ=iJ|ug9u3S%wuA=ZhPyYCR(OEyU!ZYtGtz-30c{ zpkO$KB2o9}=CFY_J2=(Nftvic~^91uijZ9$HL@mkG8Hik7%e#^uL@ z;}K9k*RPfWULk!vuSxX7*IAHR8=~#&4NRi`7j`|``O40i#MMyV7`Vc*e?dq6Z**6< z_y{Vl8fS-N7I&?R(ux}lS9G%B=o(3$j4=xrlWj~M`=}J_R8z24a8PWmj9^uqHBi7_ zN7gLZh~Y`C6%i(G<1GSDo^HvHT=ul*F$mGuoYs`J9O$c8h0@;onW{Vjqugn10vd}Z z)8M`ir7UZ&!wa&PQtyF4A^!RkpWL12x!j)2d>^FP=H=KyyjSY_bQ z>s(6v`rJo4Kbx<-t+_+WAfWJrfO=1Sn$bsN%|y4L>75hXfpnNgH?n(cf~#DTNL9j zd(lNQmEZ1So^e0w_TVaTxt;hr&ni*_{x|`Xc!Qnjv6V*^)|UP;j&Bh)OT$62j_ogE zq_CtFGCjQz(mO0%aS5J`G>Punw(adS5`Cw6i&OM%37vG>O-tZV)KDTGnFO2No<}*R zEFB*E7tOx%Z!|kNwm?+>uerT9TXH*%Oqn;ifcC}S=hwSMqRDXQuWd4)M^ujiK%8H} z_hu1B)t$VJA*^~?XKtlm^v&K2@lTDsP@j4B`A~a}Dw>$wJkH59XGv)=KXzfX(l!5} zfc$v8Br3orER9r_>oG=;Yv(lfI6;D(XgjwnJ2EAWM_#MHX>Zziva#K&JQ}0d%?aETF>-coKfKu9v!Jo>bSEyUo zP)!H&wX((9f`Xld`P2-v!?p9`3&_+K|X1u>EE@lVF5i)iSg~MpBVW zk0Hh}gNc+n2sp968UkR~x#)5kM@AFVEIDKyw^ST|s)6ZcYMnR1avGD?mRUxo zkaohajws|Y8c@>aQi5!8DYmyJB2+XD#YVfx+tG5nAFIWHJ6`~G41`B2(kTZudfN2 zR(tVP-PL!@GaA<76R1eF65|y|^+xmrhb|j~VSvj4#=MYR=4k&e5&w&u6D}))D}lUB zO}OJ^C%TsdBtC*V3t#Qxc}@`K6(Cvj{Z9tsq0t@!Zth$neGb(|Ho?xoP>e%AD+eVe zXfiHVqc>N92GA?lf*Wkd?pKcG$%fYn^@6OzKSo5t%lhJvjrG>F^ppU8S zer{4H*YyKL{7Am5#LBn;*HFK~1w^m#rS$TuoB*ezVqlD^rdd{z-B`1!;Quk|kfEij zx~%z{S53Q-!#_VzG!V>tiS*S)ZR(5+GA{OMp)_cTUQi}_y?_hJ4)SkHjmt&jm)^Eq zy~&5tTFPBZ+07etFW zkGpAF!8UlgALUr6YI=qqD~?vyvM;@6T$fw}P^=H1_&Z;&%F?c(9~u?I2F>nGpda)& zb*j?ZsOxD6uH~{(k=#!PF%IY#vyT~1T;Rfj(4zTBNZ~RL({TiF0XST~;AJmeeuC9g z;_KrJ=zS^~4;MycO3Z62C#GIc?ss>!q(S?DZAM!H?NUk7wSgXL#+a&~Y^ZN~?xZ3r zgxg%J3Xc_s1_>C7VoDezpayzvm*oS>;UdL@?!vG9Ac= zS}mab@)wi~R6{TiqX8I2pA}6hIToB#7JSnNwU!(-43pxPcUz~PMpQo*KaV+_hZtDu zw3!eLWt1lT;xUM$rIQ!KmUob!K4Oy6UmLVTR2(tuQ_t2r40o(u=oN+Pa2;J@7-Pi!p8q6it+)sR9$ zHT4plrK;Xi^iLdEUk$jJPW-U1<)#NBk+mbN^nq@0!-Uv(ZcIiJngE5{JzuCTCVBw{ zj2jBiqA3$~xK;2HbtLn;OQit=2vF_dBz(ZX$L(k{3(tFoGxtI<-F6}-EM!oW9}nMD zoqccI;iu6In`|pGwqbTGaJXdHzW7tJwfd%3q1O-7NNIB+Nr=b_6}fpg%ix#L^p%9~ zK~gYg(8!SI&Q4KIJ1FZ)uz)N}gZ~ozz!w>Ii;YUqPR@I+&_2neOv#{NHbZYg|DX{4 zG6P}r8uiPkNP#o0k)FDf8pj<`ZEYv(p<|?HWLMyh7Oh8%u@M2l7@kQmNqWY3Ji!btp}Q7F9`$x(^KzP;*`1b@Da#2lkRYNnpR zo+tK+)3sEV;@j$-(mBf4WH~bJ0u(9G_A`JgCst{0#G0v0l;fv9}l&{%F+$5#84~K@T6Gkb5)DR$YbSH4tf^B(lUQ^Cg#JbxsfgixsW#VfF-=k2 z2on?6;Ei|D5J2~^&`{#cfT}@1EECeGkE{96cT@H}AT%o!t^%9h^DUtzPljw;K z{L+0VpC<~vMSve-`(Cy{gGz5`FcDCmfgC-mg9}OMJ-)Q~Y0m0vN$3REGv7m`XqeHW z!+%ZYc|ue*8$;HG;V-l9bS#y^p@lR({gbu=6ut-#n%E3efT4kOx<`tne5uq;v}R*W zdL{GyiB+7sR=WuEzGnI4^}XKi3#AtQ$-eZCO+ahh&HQXpGYS0iYc=z6mG`1Ai~W0x zZC4WX*^`0w1Y9uULqSarmS{Us*E18VQJ9`q2LNf#*z7#wQ?EP>-mI}F=+9oOGJG`p zW`kVOH!mk9Db-4u1K|ke1Oo@654!Vjq{9j{^#Xh`k9#Y$n+C;$l|EGN&j{mhrX;pd zsqJW4Fdi%zN~(VLFcjtbAPDvzvZq3s3Fv2+R22Sy_a9yAj{o8tpqj*vlWM$19S!-JK;W9ZJm~*9&+oUbzmRkytNPUKeDf z*OvU1FsKtLj>4ocMj1SOsz>-z78kltxT)s@87jE9>{B))ZdQV5;OhRCHpD%_=H2>1 zSpYM`h^1yXyVdn{d{+?f!}*k2t8S;vL)0m2UP-hx{kvt_pN5ggFN>FsJ^Bpne~82g zsjIZi^*bTXiX~PQ{X`EBD5r?6GPr55)VnoMQ{h8TLhAK54W4(^>fc4%)TA-$U;U+q z@alFF`WVd0DQ9Z>v=BSB6XMfoU!}TQWFxs0%#v>DWjH(WDK`O`KS9CGCZr6{rAoCy z0BlLHDHr;?Bgl3D43OiJYw}OU2plb7+9inTFDLR~0|zlZ1mI*W|A1?A{Zxj4B4rPv zDG(`Lj^?`@qlM+A4B8EV=cNkDLJf~5yRMA-qQT_OD|H$BpjDz(!?Es4T}(uD$8XU@ 
z(tUGYe(h4a+f~7%dyY`=mU_abHo-k^FzQeexYjBjur{0GYS^^oJ5Z+8EZ#mCNNLH)|&%5kVqBqRR&|GDHIU zKGxp;?sh~1>Mm9emv^q-cAoBTkF31yh`c?0z;}!hw%(3JVxnS10%~>$2S;xr2|*$O zCAUXzp8D=qHsBd|>>eX*?DSL>i3Ai8-d;L(o=R>m?ryGjuHd=gCA6%(oQZ^mguu({ z5ecXu9)Xttzolew$Iiyh7W{#`cCHTKHH3wP|32mRZK8j^j$j2!Et17Zg66Bq062lUSmENmEfzxV`%L|4EQs;@%WSTGni4h$C;2M0X+8TdJb zLyk*vLr4MdnywW-s|TfU2r8R^O|hbtN^fL`UBucml#qy;hL(>0CI=@MoLf{({Fa2I zl+qn#6;(BL4SfScBV&{MrZ%>A_708+CogZG$G(34Pr{yuN4$6$iHuK3OiE69otl=D zi_Xh0C@d=eP+3)7Q(IU6v8}zMv#YzOw{LW8d}8v;)HG&sX?bOJZGGe0_uai;`v-@= zkBQd~g2!@P$W{zZ;#PddI_J7|laa<$3sSe|nYpSBkt*s-x61l-x} z%y{LthvDZJ$fLk<+)7H6G!IKY4lQh5jqvMb&i;cJUEudrrE1jMOzqwNrorA~giu>d zK|(Kkyt*W~v8O?MWLW2e86-RV9{um%HL;HCPHGFU3(4o1pk5l5E8r^O^OJrJrr;Ud zZ^rdAR%=F{XzWZz+B~HW`1Hz|AO|H(AYrGYqQJJH<^Q5f0CVNejzJ2&@rzMD=MEw1 zcG0J)mR0TzG?uqLX@hmO9dP4<$%ae|witFf4$sYAxmeF=NAA{OFkRHqg3v=KC9zjj5Dt7ocuEpoEk zvEAG7?Rjlt=bQ0W4@O>B!T&rytDGHbc29T+&j}p0Ag|giNdBc$F=1oNb)(@!wC+8H zSz-kS8S>t4&Y0JKj29ic-L4$1P2@JD@sXKC-%K;(DQZ%nwDCaM-!Kd|ufQi3ukV_e zmA`#+_ses!SlChg@||7W<^*4wftY%sAv(@SsM=k4>v|zeC>9QBU}MT*>`^s-hl~B! zZk3uyZ$6=!AHRhaaekBfIrK8bi-bI1mLQ(pGT1E!MY6vzqyObSk6SCf0sZf2Q4-eO zRZSNqo!Y9T92O~bJa;@gR3y|R&Y9w4^YaU6&)}5d0?vR|eGQh>_z&h@zHd^`Cd-PazkLjN=(Dxb>xT0*^e1YHJ1QRPWsCQCm~;UVAMeVY zhi9HaFKWJmWY~;*Lr+QShWKL5jrT0T+_%>$(!UdI+Y!gf< z0qx*9!WPPo9EKu+$i#aAt(=uzK#cFs+!iuBT<9nG1!UmQ*dHwk@?xZGdvW$qB7%(9 z)w&fjl%M$kBR&33`+0=hY1~Kfx;~|gnNfkzE`Q1OH)y~3>7TYOF_KI1^8?ncibW^` z5sG#qLeBDqVf^**xvE27ebcW>yflK+s=RvH#?j*oC9ocrPy#WMz~}(OBb%%Mm680~ z@vWsW0|M$9{TLR9;P)C)JEk&~c7(0HY$OrPnSngRZVduUTB>&@Y^I;uh$wNua35V~ z@`W_Y+urI7UK@r<3y<|$-t(A%Tz_Zn+5ovHqDc*ti}K*S5g3q(g{ zhwT%V>x^MP)bL-7C_DRUt6f0PLC3my^!i!m ziOL~^oHR(O1qR&T)T}treHIqkl&2Flp0L>InLm&YS_OwlEdcltM_)ktwdZ<4Cq7U> ziSjeU|MV??<8JNeK|7#jqC#%T_pIbBZ~X%5FF>sx2U?uCq#tKYbvHu(hCYGRN4`rJ z5Y@A!DrQnACMM*u(1{R+>gJ??I%4;nUwz??R z4(@!5c4Tr*rMc#!zo~>S+KOtfAx54~TGy7hdl*vR!1bRj%AZHj>`i^IF&4j7VKY!$ z9|=}VEIK17DwJ#(c?KUnbzXe%@SKNlajG~m&ymn+XHFw**`obTS;V>Un|m`}{3yAf z-Y30@WKuH3MQqVM_r``Z7#xYY=UbY?Yb~xdsPpp4O^ce5#0I&e{XI*$zr$*COyejidm#J%MZSfTF;|2N^7Gu9kGwpuVyH(v-m}gp0;eMDpttxBbi?HHe$OxxGD%?W6{aIe(I8d+1?y^ido}R64*q)Vt5d?ISbVkqr3a*c+QfWfhx~raf9|j2?yFak z7{ZEG+Hg)S8!y{ua?rSdQk>7k519|`BK+P4UG3|8){57pe3pMsq_9}G9bmO6vGhX4 z!J^g#o6_`Cxy~-x@!I|1soUS!nBi?uz`VYxnYOR+}>z!ijeWH%RPvP^)&>6S?+MZo2KInsJ%C z7my4lOogG>8ci9DO46YVk)i43gb`&ivu!M+sola6#^UPHlRuNw-~XhctY(h=w*9Q@OkLD6o5VMiN@kH`Aou9|pvs@#a}HVwK2_&;iaf5^ zscg*Pgzc60WpsZHmXj=%fh{xio4CF^qG)Y}_!^RBbr@K{c_~qOxqVYWh$mtM9Gb%mF@M4kKMOf;wbV~s>7hRvIr^s z_P6v5&xF6i4-~^)eI!}2{rfG;hi=;Ry}!yxjO{sXGwdA5yR#PNo&0gF8I7pr4^>9) z9ZEcii-$o2q{0M46kiTRm-B_Uo#Q zQM(6!lJdSANi{w<#u#o@A0*cAZS&>bw+WyirBlbpAwT%PIZ#2R3g37q6I_zQn!Umt+=Dz<<8fHlOnb_44kfoQ1dr?JEuAua#07D=hrPJ9^dS zyXCML&9SbNB(x@K$?IV&#IRybB~o2c7ljH*kY{!Y9Jnqx9$>dq13yT;S=ThSozc3T z_iV_WN_XN-NY8yLbxBUwC3TJl*x^u3DA_A&Jq2RjUJrCT`@l;qR7WM3JzEi`xL!7q zw5=)gh;M=Y$ZLT|iz*NKbnS6{V;bKKdulo9aCesB9ekLySFn0nd^6(gnT*|Tni(Jc zSF=dbeJv&&je+0Scw4th(uU)FG8Yhy%*}$j)UansZ&fyOp(F@M~XJ*nOVtM%Y~xpty<%22(VoaD2lMI^x4G3lP-jTD~8kmsDW6%z^v3qB{# zpxmv>!tbCO_H$?rAyy2`HjhIJ~JBaHW_u*(ImHISEM7LP+88#bd#dV)5GMMUssFeRsBxU!A!UF{ucSz zO{sgs0X4P29n#Al2T^@F`rVT9kolO|_~Q{WcC(cT+*`1b*n`T%*H)9qM9Ht4Qqk~vBxXI~rOJ3Zo?&c077fe@wR%^yYA4pv#B%+Y8(V;RlAj>H(?3I8Vup5$R4%3f< z#6LVjOEplp&fT8BnI(9&zTL{CH=?DxH$z2!9b%?G%56)#fWlqFn_uof-4a*vjV9^1 zsi*fGZ*P9Pj*hX2EzYawoyM8er`ll|k468|RdK1w@W5yWS~Nu`7fD$4_yKXlRnK?z zIOG<%0iH()xpq04E0k$wr5szthqw5jQ>3zVtz+GN)%Ez$?h*f17A$S_lxnH;VH?jD zPrK>dKbGEiHnI}&!pwS}d^6H(>Az6N}I&KS~tY7JllRw=1v)XdCol@hI5+hD5 zd4!vOuY@5H)-G-V2nbV*%4d}q{f#71wz_s{Mt|V?UDC z`1_d=W|pT^(d2Q|EywO9F&dCxS@k 
z1g8xBcJpH9ZE*D%uDpN}4x3Lso94%T$cT?%?be$0-gEfMwXA~;w{}x9USv^ncoWZ;& zS6k(xdH2_Ee+!ptzktEIxmtI+E)|b4)(_76W4dFfBFi9&d%3+d=Ceh;4ES?)X|gTC z)Y0Ro?}Xbu;}kv`LE#p*VVI6hn!iRrYmb zi^ID&e*cQlK1A^Z>!D9Eq86ikS|e4OG#LHEUw$R88{5u)=@iG-Tkt># zWBG>BE||058nOyQeZGau0{fMW{C)u~n)@xEKcA-vT6iCH4v@iTWIHM)w;LB4Va5$= z@#{KcTidH9Ap~XYP90qA>9*;%X~#5&gFC+ueLJ2q*)v%1&l`QP5Ox!}{b-jddT*WQ zbmmGZw+e~qK!!KyVCH={Nh1kS!g!b zR{=eUm!x{Bqej;v(MzRoF;dM;=6vhPy=nwsMy}Jj`EZY<{+K*lO?= z+T+Rk{JIEt%P?;gKr)-#%}lhf60^G0)L0kcR!Q7bL0$Wc(cPo^mbe-Vq`qKW?FVb8 zZX(`sO<3Hdo~MesL4TKgtw$ZfjFax_IT#fu64Fuz?H0uH+MT^`;3W2%ot;Kl>~ecY zPAr=hP2REB{RG1^Ng5(1wzb@%gZ>0I;hp8&=5)F(wd=F?mZ?LjJ?+6b>VCTm|Is?P z##vVRVn1=xk79SJ%3Jed$1Wv^-=-=hPxHT%``@bZl5N=CCi!s3`kgQ#KF)Q8PRO9F z_5!kVi>!Spx-&QVYU#>QmGeiV(fA(KRF6k0RI!68c9w>3U=gwGl@x(2WqCo?5Acpp z4c{NV=%2}Nw|#W0+^WggI{@BAIuX0J&i2G(5PDL!<&ib@J&3?fOd6b2W8RV3X&A#R zb^h_}_nyby8)at47S<*FhMYM|I~C5LJCe&TG70~T&ct{%xDGJ(VJZFcX1zfXhbLT6 zrcbt|3PM@Vj`_pQ3n)+5SZn9;+Lf~B=cMtIj``8vZ%q1C7K6Pb4ZpC}*>S5{6Ku&} zS3fa7Rd3pvh)7*5Bb}d}`kvDKE#(&VsHmqZc09Qf)~PE)lBIl9sNlhF=`*Sm7sPK} zzY9olJcIYXIU@8c{1#c7+UjGEs1-|1W_HW%Pa@bE*p*$wyaX-L4*oWEg;a-=r3IMpe+;+Z6e_RZf2er)tGn*~(VRwk z4@*?2Kfc&;?aq)!ruor}#YXXopSkhNIw5y!*drI^7a9uXr8T(tJ~6rsIi2xmktNF6 z$>AvE2bkfGcP5b7Tv%!mEfH%&Fw z;`q)d-^r+=#0rfxYk#D&<2Oskk<-E2#cO`|{q~JX*8>-)ZL>j@Z>PQaaeLia3+^D7 z!@ErAu&ZRQES=f3oU_+Xp2~Tw%ysU46u;}$r8dX->QuvB;yVG(Re83|oM1U!8QJ32 zGDmv*GW+kL-Zdltbc(TjHDEx5(DawD|A;b820XDm3r#u(X3run& zg_+jV>pi6`bqUt}Jmrv0i3>w|;mUKFnB0|S;#1 z5*zx~(}T4Ww1^kG)4ipS_!yD*^r=6whk4ar`121d$tH=AuzJ_hsw!5!%bY-Lb@P>P zPjFYuq7I}PWTpO0I<6p>miwQ+tg%*(!e)wLpuJnm#)z`Gc`ARF-=Mq1pzpBQZ_*(7 z=v&Qw)|8>TM-e!@;RY7|OU zO}(MsqUSy1HR1x!2)cC^$j`W|8KW|}sqsen$V03r*)87XL9WoFs6uvmQd*&2y&shC zNoV8f0@@GFcl!JimhoPXz0-EdV{`Ik`vm6@#<6>Rz-k;_u7(5u{DjE@6T67?gU<*u2DjW5r*31 zoB#1?d9%OheVJG@8K`4b#rN~gwadhG-+cYrQ=L6t>zUZ@)2>AFCb=N( zh3%Ae1ThSOy6?-{Cceth2OMj@5o+-~p+m4vdKSAlT**E>9IV)OvCW61XT>tg`6gn` zk^8S)9_z;PuSqJSUnFz-8Q8fN<4u_3u6sykks5c*SC)0iUK{WeopE{iRpRat|66wk z+Rr*{Tc|fM2}8*q7$*`GpcjRXViz7asBXmW)82nxokm@oYEa|F@F9|16TYa1 zvwiUB%wjPJBe!_|$=hX{zpM&DH!ATt4>E|WVEftixfq|0-+*{shVjY6+FZx$qQeyP zhE)}lqHhu@I!p+!MkwN->NXx%v=`wxungmOHCL=9;pOfIKe)H|F3mnucJ>AVR(jI& zh@KUf)L7aTGWWpdBUJw8pXN}p2Xbxn&rMxx-Cr@%=t(Nj@_#zFGvd~8gF@oeSi*XC z$wIDEgIvRZ_X6rXr$jPW-4}mdo1F67=)^(DEtV*D_`R*2H6C3y{*@@>;#21%;F>4u zZ>#w@bZBvynfW9+P#Yuj_EWs-+$U#;Z5!I3lfk#0E}#g5-R?8ofLj+(2&g6Ord6ZQ z2b!;XM;i_4jU8l?v_EwaHdo+$EAsbDr3DF@y9!Sh__g{As0idB-1BGou}99Cf8vAo zN-rRcBka4G%G**a=bk0xN=~W`Qi+vkIqsdICgc@r;4ti6Hb_S8C^!A}_GEx8TGh+< z(B+=<6@%_n(NA_C7AHpcX&3+{==ViRys5pUf7j;)nKtJy#1AHrGA_f`FXZH zZl>YIVB$*F3hzr~OH>#O8%mz>te}kzIys_H&gB?DYm(_>9DQ8!+R2TeAj zuWu+uFh#Nl;|Z&NX5N_y1O~t?#=7N`+uk#c*yq`h^HnQ`7!nRhx%kz09wS>jwRCta6ELab?T7Ma=6&D}EHBP-{ig16#)#_At~WXw_nULYTFr;t^D zAS%RF`$53=c6O-aXJjDj5h}>VV+qdp+~`x&po!`me!p8?R_dWqwqqfg!Yy(4g8dQ_ym%&1w_Lq&8T`IepGCW*QFJcI&@kUk z_5E*oSP49O5E743lw$q~Z^az7Li;xo1Ko{kBLbA)`B@mo>g^ondKG-l$4`YN&1Q$W zSdAnm(HY?|dXhkap#gV~j2DAmj&yuwsY(*a*5+q8G2UB`zJX3eDQ_YT`9)UO;_m;=LODQY2meJN^EYjq z#S#I|&r`1a?AE``h2O)ZL9kD&bH5x<%fl2n5q2j zzQAa^V4UQ)d06tXE@J6RtCYah?|_Q=@5I;T^GMag0}U-Zu4+oO?KLbwPX)RKUOn+` zm!)O5<{2WNyCMGg&Q30Btv}|DWmq@vOBNgA{vCnCq~C{ShnX|3K~)VutIoiIBRx3x zX<$Bmd+zg(b(xW3Cu2SX53CjKXod< z`^0FS!+k+;Zbi7Zzd;p@Wj_BfDBQfo(tG z)3ZL1Jk>ArsR%CbEj1r#d`cD zU0|SN$HWDnM4sY&>H;FCK(Orar-)`}`P5Gk5NEy3&Uqz?6~XY7m^@PepCzQ^gfT!N zbXg^{RqE}4?i}Wmc0NyBzT+D{b|>+uB$uIHUZr&68{NIWz9L^AXo2vya`y8U2ya(2 zRBb?bd+C-LL3qi&9lvGdrHMO0`azlZ9!OXuiP4?}ELWAG!1w@-9gf)DYwbZ(z>I(~ znzJ{1xmpv)j_S#Z0-A5%Kpjhz(9h7@rk~Cd8?0;f`7$aOk6psbk?MA7w`_-S6IEK( 
zX?uu0zCZb*uyX+&7Wwfh*T#CMy4pqy)?%y+{RxQhj#5KcPJi4tAq{5Pd3^s3~N`k{~N21`(t1|KGR%?tDbr8Rqukr4yC62c{>utB_ z!^cOM^t}D{H;ovge%VA7vK%|9@p=kMd|t{r4vJ}Z(kA`3Gi!EqyKCLJk(t-`7J+5W zD&c(CNf%}cS0mruy*)J0aQDlrgIGqt_+_(Q-0ru&0(~#*$y;eTA9qd$M5WV2Dp*zZ&JfPlT<|>}AkhEsB;IB|XUzGP z)2EX%gweMi&`5m^{%VSL6B4;!h-v=FmtLsQaVI2B(0Hu}-)DJf+I0JYy=E-z`=k(J zOj>U0fEuM$zD)FiyFSjr_bU`kNzu(w9Y_^8ppJGH{GK7xSTUYF*RPz9DUqQdk1l1a z(`#Xxj&&uaVUH@>-fza-TCCoBZcHQ9`8ktk-KTmmYLL|u)f@f5zst`CT&8;7Y15pK zos*2_+V1SrU=Dhfy}2SV#L2Bj{&+j`bc+O%Fj;r3pN6{@9nHrwZ{$u$5ORNT8=nO& zO3!Smwtf76*!%Kus=Kf6W1h#3F+%2fI?6m$W-QAh-AtTMMt7Z zMJmax(o9G~$WWwr?Q`n>{hoJvp6Ac^y59S`&K<`&`@8qrYkk&dt^NJ3rM7Z4aK+MW z@+n_K{l<-RPjfSQEbE_sOySi|3v=f^d1UxtUtB=V@`;YzS>mtVdvE1>Vs43r=tFjo z89)jiJI%f;{Ow}KY^+f1-~pMT=DoOE^?Ns;E*kJf;CI}V6Vw+iFY`@|m}{5h`iiIr z9ikp7df&FOZR)shUwaQOc%kZE-K&b4!0G@Vo)EcGI{R#+gS?bRM~#kJVqy#Tzcx62 zSgrn(vQ?laF0@mI)|BHk<27w|1K}ihnYE=A$6NXPvd7zhHmA89d)~XEV_0E#ME$`P zc=VRI`smzLcgt3}+#kDRL-yO5RHFH;m#fe#o*UNP;S%-! z^eTZS_a>U+Eov{>kI#D?tqcvjSid>o&U$RdKCjpM?y$Eny~p*Yto=p|Z!9kH2vFSb z70tN$BY$l5(R+@QhD&mBpSV6wF7~b*Q4l(zs8x~Ur?@0A`_X84T=r)CaR(?cDCqv5 zM8dIAQKAk>1`AaQb??Nk244Qcy&8H&pT|4H&=bo^dz4khFh*}YZq(`i=xXBfukF2- z3zTkO_+#MaEjjj6PT4!Gol{I7nLMW`Wy?A|ut{;!DCgsgjP0#Ak9=-f4-BXFiK*f9 zNJ(r6eey(r(;%EzK5LQy4&*Zan7mNeU6!_is}<{evuYk(x$g2*!|hDQo{-F=yRK;` zC1e`hU)ybUV`P)*w0(SFlXkOj zq=fBTO5bfS)~nU>bqOA7*D`ixeqfV2eMneZNV@yz>sae@^@~K~PDR?2W&^}&oTZR$ z%Zphb#$bhy>8gRvOWSOX-Y(?|b9D72s)SOkIBl41xPEm1vECn^s<=|VR9c`%>sEJy zqvX+DrM2>P&9(A}JWe}f`CIZ2^zY2!dG%=hW@awNlpWL9&2j$zM|1Y1gp_Y*#)Hq^ z6+9lHx5)Kv@yD6<9li{WMaBJ3h=d=fBNi`*h2*`_E$r9xS3IMcUe|rR+TGt)mcagQ zpULxwm^oVEnGBOpQ%PMG@h4vZ5`W-HzrF7m%j89cq|2hSc~f@jWwDH(#Eew~O3)BZs@pMG5G{u-Ur;)7&fK2$?N3B* zisX#3NLsfIsJZHxTjf8wbG<-8IL9wATa?x=k)9N=eYtu@s98MK*~NKn<-DcePek?q%3*PYh&HaS>=XSYSZTK$!^QT+PGR8Loa zi(c2QAdTl5SDWK}@6pU`qh1RMu8cZ8|NKt2Z`-H%<1(EwnAUpZ$$#7RSHb|8kA-(WLpv(soO;1uoH4YMN~IABtHw$LO<(X_1)A&lsCJ75Z4s67CDcFpL#5 z886Hy1rh6}bq6Z~j@HSIG-Wu6jAsdW3$R~d6aL`Mel!jjH0OJ}t?y*xB#mj(F_K1KHGK&nc_irMA(@M?P6fCZ8TP z+#c6fhWzU%d(HQ=;@_tBHk%$bPwYw zzHDQH^o9jqP|Ij=#qb38JkiDV;a0CBNecHVd>CIRXy~7(IPHnZK|MLjJn)XHe`_q{xRllV) z9xrPfZg^uA6M7b-7*FkYl=so?@yG}tIG@x$yaIWn0?Ezy_g3E3PR9}7gr2m7hg?$w z%@i5B4vE)iT=c6R;A76p!nL<0cG{ewPNt*M@ClM#$y}?uv|$N{QIYGkFKk;jpRXGi zt-S2tdA!8&?XCUSS6Ra{hA6&Y{tAf{G0BH|E6Z@0V*BFhy=I|}{NOEdudi-qcOJ<9 z=DQOA?P{Vf|L4tZ!^eIF9e%B+_3VV7r{D7*#gCXw*o*|*CU4t~@5FyS8N49tyug~v z^=yYziB(>{Or7v6Mv=$1#tu3^J>%7D*Yj8G){^K_HfVxN-@WPl)lw#zHNi>M9dMK5 zF~(Y{^z3t$SZn8rN3s0%EL8i3PU>744H%Vwb0a#v?nh=1--Olf`yUb+s0HA~Zi$IW zT{k+(*0wOb5i9zw?zM5fn65+5m7D27!v<1MKg894zNm43YwRJW-e))7)87?#n~^j> z_dxSzE9=61HkAOM+Z5d0xp4pf+oVSfkFS<*Fs<80MOvSJ_x=(0PEEt{iv2T@5#)}x~~P?qk?04*^Y~AY(U+% zUBy>p{+A3v?vFb6{ZXPTx!)p^;3U}O_TWkVRn6Fo!Tz#_)Z%>O>Gru2)*+c8nNn2K zwX*GcA8R+*eq;2n!YQ)0&_*;KvsR0T2fqK47n}YX&!s5ej(Z&6XGI^WxK-wkI8rdz zaXwKO$@5-|qCrU)2_(E#S&XIflO~cyaE1^z3Wtpn74c zGhso``)?h6e6^?cXWfU(v!aI&SMDy`FfRPm&Gv+~p|K$H5jInjO3wM9R5-J%Xav1n z`!kf#1+R&DQu;@`+wOMuYw!Em6Foh9OefA{sKJIg zovZk3m}aRkw}|rv?}UpX`&@owI^I)%=6HLfW~_L-hP+UBA^n`AfB(Zk*$D}E;pMGS zkO}gms=d{k_3N~5a>p!kD+M4dhl1Nu1?&8K#$0e=a^p~*QB!0o_TUSd)c~9VMuiK z@SVuGsQA{`OVU3!|J!$z!&Aur>Ns{T*Ua{X7I z+%gXJIR3wXbnBl7vsC}%vs*^!`LI7vZ~g0Ot*8sL`wu6~$gpReGSS~9CJZOd)(j@q zr%bA;#q4*9br=NawBP2X*LnSRWo~6XY;PgMGiIUrgxNO!G_~PXp$!M#NY?JaE=~#G z1ND7$!PMhdcc%Q%zU@-a_KwQy@ykZhw5yKtbe{v<>z}wM6f=E)Ddp*2U(#_#SdYmr zK!>wxb66(yB>iH!y1ZKG5$CUUx!zIS2GMmNdSo899E@roiq)*>VqEQ!(L3eK(eb7E zW9ZLWF^1W>nWdW4jOn8ASnBuYnLRE`%sTJ6XQ}y``u)|{xu%Y|EgEfkH!t~kHe$1< z-bB?PT6e?w`S+ZU7O%$dn!bSBk=Z{*#FXO)X$87Pq=YX80Bi|0M@g;6}p2l%@Z1JVGV^?|6GTwEaLFGiE{?Fl=V8Js)j#DES 
zYqsyavsJIeGP6BS@aS0FrpwRZ)E$Y>0yl1+`09_>VSs6_&nrdjbnpJKzkd(=`={qt zA4YDe*qZmd#+Sc;7rV;CI~$oURu5a?Jx|*1d4XgiJN4GgA8}Ks<|R$?ABhC>e*H#` zV?TuxR%=}CU}Nv-5ZHJT8fAxL@Aws`bzVrs`zowrWq{0}B){ke_>6mlh=Z*<6?dr!})sy>`3%`Y%8zcQ@DF<3L1 z{G?T`-s8paK4}Gdm76+SwU!5Oz`uKUtZ;V$VNyM-_+_eB zn{6Vm=axJnIc|l*&>p3I#Qrg+BQ5(j+J~pJi+MzXwn=kT$0Q2xW911ejZGJx+Snho zYugs#aHs9j>)Zul+>>3q?X@-AId4$Bcq6CluKk0bVc>+q4r2%59>ry)g32NZm!cte z>=W0%FBzQrF=}1Z{h?Av^gSJMHVM_8r|w-imo7;BCVRikCd?)ES-MLS^(Ap_Ngbk; z7w0_;^^@@gPlbRgwg-g7{zYEjw;yiB?a!-qny__D7&T@nb8f4#Z7$uFNwlHalumxv zKU-+9GiUaPKQ2e5Qp(zsh5ghMqeB-O=%Tea_BceTy=u>y-Q=PP_8y5i)?s(O^-IxN z{aMjbp4_ZyReSpECZ&Z|fx;Ax^VysoLWb)u-X;f9ge6yE)sMD}kL=s;VZJkTm?LTV zP=!j(x7{ig57_o48gmRKHVU+zI9M1U*dI7kohe*9F5^?-c;)Is()!6OW&t`gHf`C$ci!FmG|hx% zpwx|6pcDAuW>uGS!u-)Dm3d(Ln>Vq_tW51@PI6O$w4P7gCN8ud9)&Z63}>E)Iy7c- zhc&m@jR(DqXp0t(?zkiVDD8*$4$D5_Rj$N6!A)`vzIxHz_s#OEk3T%IAUUP@RZ>8! zpFa1s_MwzK`dwX@yFXnCKZXa|#6zDr{?@+kQs01Z9L)tuylu_$Eq?Zd#up_tBQh#4 z9-2OG9~4hA(=Jkhf&?q3UzM(DH#k1=8e{dQI&QM`Gd!`cfo;%i=TY zXY$PSrag8fE?$Za7#vtpk>0MGlgJuiyY!OIp~CdiFdhHiC7Q$|!^y{#RJ$qStVXiA zmv&iHO*Nk^k)P#};#;++G01cxVj5*5YO@5{y&337Zivh8HCvRm9aj&w3J99{ELER( ze@Op^KhKYK_3_L->RqPp3y=3b_TqeR^*vII-bq|~?5=+1Rvva;(elpETmoJUxs8}{ zPeG4jS(94o#<4)3&MW+{QWWIrqy+frsWAGUzbAcyA|EiE$f>a2A!C+Te&**%1_dQP zrM-I`CAWFcxHfX#&Je3Va$U23CavVj5uaAsmu6eKj22H={McTS^o-^;dy9^jxrmO` zEvCKfyvpC&^Eiwp995gNw$ooZD#pL2oh-{CZ*I8fYR`|NEenARqG6&p_pHg-S~47S zNiSZLlW>|SRH0C;c2zd^%4}TJ*gLR8FoCA^srsjvt++us^$yv+nOjfb{d1I$AFp*4 zrJsIvIh)41$*IhH;o2U&ZP<@GVx-Gk&oq_w-7;kb-DwlrZ49d!yG)-+?%aL2;ZdLv z$0rM+iu%m&d0K>=Uc<&2XEmV@PYB{v*MrMD9>1>_4w*iGH<8icGrRJrlC*Tij^nwf zEn(ALR;i+`+%;(GHZq(0YN9r6`oz&j0VXbkREkGjdpW#SRpi2Z zuU_%>5&7mx5koh;#s4z5oEL~q~d z{U*G}66w4B@|_H%j%&WsAEnP>sEO}yoHr?N(Ac&?^kExzeT&r|+>_wEH|vwCff0Mu zOM>#N9iQFqJ~MdcIgnp2o<7=lcrZbUw`f2@aH*8BQ?_!WN`Yk zs{7*O^r5eACN#K3$pJZP2W73 zU*&YIAius#QI5G>q~Lh|S5;N|2o7F3rM_)fPoKXld;HVLM`3SW6IseHE#eMl!B5&P z1x3ydi}4NmEbg;e(4T#6m)!aEU3WkW)`)8RaAt{JTZyTIUp)P`I@oI;4V3$paWF#u ziqndx?bD$qza-7|Oat~vUT%vvk4v)kcJjp;f+ixj!ZRFi&C#{-YVOeEU$U*Vl$?~- z(7z$8xHju2RFNTP9K%i3Cz{OliMn62z;ENt`OkURrd$HIavpywWBV)nQjx!JRNMeV zc7WTsu$oh{3bUp9Z7CX_dn>PGD)nhcJ;fAAmfjWzWGmG2PNe+E00vG-UE9WWZ1#=clY7xuF_YI}JkKUDJ$U7X!_BxpM zMzyG5UdHA3xiFTM93gQ2oa=7NWZ}#mDs7SlI<(9V&Po$Co0heM8`f@)sX}f~U ztBK26D{-j)BqYIos&9y3ENb9)a_@x9AJ*ta+Ig@Yy9r(Fzmh1(dH zEaGL-PoDPS@e->Xq!l@`bZ*NhTV`EzMK59f(iKCd5KrmX_qAH;0?Wz(l z@;$%&;?U+kvuA7aOp0Ub4{M=+mC}wHeC3?=tpTs+H`W6sL?5}TYqZw19IX1>lE(7V zLV%XpLse~|qS%6mhq0r&K#adSM)}-W_Vafu$2;ZbGK7!peRcnt?5oV2K+&na949^s z8SdHdByY>|Sgpw!)8G6m_Q)WP=dk&^!a4V^+DF9CI#WG$Wb3%0QDvNX&zY;aNs5o* zRh7(>QKs<@?75OQ#y9)&^g<5}+;0dKE!#q^Rq)+oK<%(%PKe!^OPW8q-v)>dVIVkF zJGkFQ_88~qtyS6fcPDPFEt zOA%5Dc5K7m``-r@K}$5#&|i(VoULl_**6O77t;hxj(jq_ zDa$HtJeR9ex!r(}>p^CU`LO(^+Fs}VjeSl%RLfBpgs(l% z(J19_-uX!=cl+Zo##-61;zg(L&tnF;?9Hw`HyE-`7w+pDXFM7kZ1wKOq)Gwn4=($H zv@6lC-+HmJ@99ax!i`7XZ%xkc5jpV6z0}s@CBfm|3;9EP&%II1e|oo0q`&fnpL6I5 zfzOxM_(kuQ=RbU#%W*aLxZZD&-oSzjl6=7^>f&MC~N>B|j?YbL8b>*=SZ*8QCuI?>7QTxalP|*0;AWg@4=tjH|PS*78Cy0 zx-aDNDkxc?tg5E=Z#e`M+@rTuHPn^=$t>*n^Cl`1V-yUPT>k!`#;A<`hAO$h+dCOm z$hiSX11MU19D~v>LFl9CS7$5FfJ0v7Z<5N(s57c)kiSa$RfAlL#+LM9!>D7%cE?cJ z8FV3oBS-x_lo6k{UYd zD%uLFI2BUG8wLgjW+rA{78YI=5djgE|Mq{RiZ?V)^iT=r|5v<$NXnD9oXtFb({H2`|+nj?qjoHS(kIg9p8QYX@~Uv~1Z?@wZr<4>Wdw%VYS;l1wdj^%p`5 zJHL5D2|tPxHimpCTaLvLQ3pF0GZ!F_;Rj})e|q8x{TvR^PW zLjjmKmH^i^#I{5A7k&5;)L#Fe(z7m59L$gpYOF#}8(pYU=wfz&K(0n-1ZBsdM!JnY zt+W)>Gqma;X$k|qYH-3%nZ&6 zGKOq=pt38PczHc6^>2d$=%>RBYhwm^NReb{W2hiS0Ugvybs5nHQZY)yc&LPI4|LhF z(NDx{q6X-;(BzLao}HlxOr_9FI^M<@OJIgN(%(yWq1`|)TiXF!UR&FKJheD%+Cjwt 
zi+*Sc2qV}+H$Wt18>lI5UI_2fh|?zMQ-BQnC`?g5A1H5zkrx-NF2$e@Vx^QxM*h%_ zP!j~>Se8>Mb)}m-UzhlfNQC{g{pAO>*@VmD1q%%frK*tr_=6l_`wF%CK^60mV#3v! z;SW%A5ii9~j6`Z?D2{!E6f?ODuO|cAum;^eI?M1DeUt+O@MZlnJT)jFd9RT^V`#7u zP<|5nhQxysnb1H3qj*?f-+w57Y;Ib3fQahcBIQNBTJcaoy9_!QN%de!5^C)b!AMnBKpn?S+dg5aRc;OC`^$0kF49{4`iy=(;_?^))FX)0CW!(mB$J`I z0u->97e~qrdV|XU!q@St`ZN;${z!jemT+0};gB@UmQ-~Y)p+X%QCq`~Tfw%)&)VsVh-aR%n77&z)1L0K+W6PEaVD8)D-@`CLF*(aNxdcoz%QZqlYu zmtaj{P-RUnM@{@SUXHPj#TK~n!3F(^D zygrR&09hI&y+nQYcrTE&(0k37@_6TpkY!{4tHd!I%D zL82`T$PSv?rvyrK7CQW%qJV{iv5-`Z2xxN#XdPt@HQl93cY(NJ*R%7^Ga32QfGVJk8b}+k1tC{pkHIIa@NQ7Ku*6@g6C*WE z+CsFA@FI29vKh_z4rl`CN9$LLltLNc4|+sU7z+B4RVNLM6L1R|014V$FbE|8hszQ` z1+kbaSQP2ERP-B>6_k;xMyf~9kKJ}M2L~8oV@SmaS4Ni5T53R=@2qJdny0CL`5gqf zRE%z5YRP3eutD$P`Z3r50^^k>lf%ziYeAx4}R-T}7~ zQMxO}-(S_B)DyN}3RAKA_k-DK13sXMNk}^@aLiW#DXygQRD$_2jGaFNf9wL-I1k{wIa$b-y%pd{oMX)VwP_XjI*ow_xR|*|V*o=*3Pd>xYS#DfutzzW z@g4N?kIx2uhOGi@BwY~i$%FF++<8X=C7QA#ybJIIvhKp*AY%gYk1+WF9eD0ww-H$h z!}T^el-MHK~JYSz3^UmR-#%F03sUoCIO2v(D12J)}$1TRI}HL{S&KZAy<0x-E? zG|KSE<0Fi!eEhu}+$FIXKn`|P`VV4eXAHpxBYq)H2{JnlzK9Z`zNXD6GETIwdD+lD z#TRrS2JKAK6qvQ#CX4HBfwI{fIuwf>i~QLY?S1C0HaZ- zIv&uDLJTw&puu3R!Q3+ZkgCCG*3!^|iuQPA1^iC5-R3u|PXil8^51`TZjHPLVE`Z) z=X=}21eNeq#IoH)8P*=)e=8cMq8 z5@8a2rPT0EphB=}DIKWAOb>fh0L}UMDk@4n%nSDttZ8CzBcVKQ^ba##8<+$sBAJ+J z+7u$hDGZcX;!f*F@2Gb2mc43u*|?4{V3uX zE&9#Z29f;%4@9L}Nglx(tVrr66A&!$mayh>JdK1^r92O!E017hB53_p$h=7s1g!&p zN7AohHDvKvyY-sS~?7-X)TRuC4UIi52>%&LBvzQK?9l7N0tll z(h8%1X+o0%PZIiIq<;3lOCl|j23vr1cdtJTo7P{5b{v^8kZckUhoNY&X#2rFlQ*i7 zzqcV#MtE-$FrBxe-%y+q%4~y;8{-JA%g7F({H;E+2pCTaQ@UK1Gy?Nh1u>D}8maDo zd;#YMnVbe^52T!AJybYb+ew%PP#pxA2v0Hosvr;yX-^RIL6<`_EuV%NlGK@i@j;q| z$N>#v*i?(?yJV+SA&EnJ?2E-5Ln^}pYoa+CgBX*y^BsTxG6+hj%(K7=(0LfjOPbM8 z@+<;1I{#H(eh0ypiD_+M6RvhX8HiJlFb^)WqWq2$&olx6k`fSGt0wahWRX=1nK^4G zX=_c*!35!R&;25=#&kNs_-Ck1b<#I3KT&Hyd{xG{qvxBfCWNk zW-heyeBe>Z$_~nhF4PH(lo7d;e<2#$?rTJ{1_I82%Y(j4P~5mj1S|t-0+d!qmh;c6 z2dN2J(ukQ+fLyQ`Xx4<3)3gu`K%#i6`m8V#00LYVMk46~e69jDiZVC7&_9Z{11Sh> zFj_M55US>u5DdtRBUC`)6C)anr;qL{wn5*sG4X>%SoF~Z0JX>% z&MKYr_a}PyQB_y;z;%B@jb;f;A#j7rhJwHgSqZ?8Jb{}7LkBqSp~Tt_9D_y#urGH$ z^V7TfN@5^5<&dPl_T{7_l71w0w};B*c{AX>Q32Da$wMDRF&i_ zVC(+qF_J$C9#9Z14VzV#N&-VW3cpTqd0_}uNuvW)42B_K14~Bxk6z$Pft(>rfk*}6 z7osfJh~)4X5(7ZmK=}>8F(|_?CY}gVk)ogf<10-KsbDu?LMWIa5iB$eJ6Xs6h7^GL zXtD@6k^Z3<;Kd({>H#sA{9|9>pMYo~kWPb-lMtmG$H=0zWyTl+0^}Wu(5oI)LvY%( zbO07eQhyeJL~%X@mL$e@+ZGHPjD@6tWHO8#D#^acHr7Wu3FHjT2LT)!gQVAZl7=LJ zu1J-77}EYDCa_@?KEY;?puw&GVl<2n-W~L&EE@m{Xn+(&%;z2nOPC3QJbffzl0gy@ z31oA#7+9$&uOwq2$Q5u<0*vz=9=U!~Lt4OLozKYHOu_G23|LzNCQD0OVz8(B3t2E= zqX5{Ttp$Lyx_~;E`rbRR4^3)<;HVX0VwF8R@^SR;^$0KMV(o1iVJ+xKYh41t8Z>qW zIdV7XOousEUXQh-R>To6y6Ev^Tx=|7QPg-~X!L*-6H|#4pco)z0mvjTM*>ePRhBcF z2k;8E8-dJ+rxI(8il~cY`apc7Yzz_tIMRPM7d99mF25eE48BFuD=^E}OMl`cX|SC? 
z3J*JAipU1Jjlw^$LPiuiGq~J8w$6qb#C){Y4o4ds0Rtx`FFXn3#uTb1x1=;b9TDOp+Rp^53hR6uHGSWnR(gpyJ zOpODO3IX{AEF!CFDFBn;XHsuJe_dh>00GiW(x>4nv-(UWfGU5W3g{M6X@u!yvO=W5 z4*rn6)t%K9ntH+h?EN8?MK-nMI=|6hpCTFj$3%RYPn{o@PdK!#sLti-%U3U7zvesO zgIB6D{xa37CjBPR)wYwb0v8i-3Ncw4REfTzbaxHfa1;Uo)9{JM{sKpZP#ORP0zLpp zLMvFd;zuCR?ZCtQK`&sU6(kM@N?*}KJ%}kE1?rGVBDZ9oOM^WH#u8>~TBr&U0FsQY z?G{*@U)3ZUHnJ3$7$q64RgI39_R$;}7zi(GYApSt3s+7^{f;Zknbrs1X{-UWp^u0q z5J%~-m=7`2$EyZbAYdk$We*>$gChE;&2l*|*1p}v#Q=bd619DmR|v60NhW|}3kU^J z=#Jb2qOJhEBsUJKU(^FIO;R~X1xfL?mcrgABWK);#gx^(Hx5%H^zXaPq*cQ-RU`Y; z{(I@h_-KxoAq&rQOBwoxD$^jZr;ncuYSc~Ey_Pa6QI2=C>Kn%U@gpw`NK3W=WEzMl zg4;k|mXwi1Ktxt5m|Mo5nC7nvAk8u(v*Jp?Nr+@Xwuqa-2sB_~$eduASYN;j*g*Ib zawq;_mOTiZ|7ckc=o^WSBBcWw^OuaEjj(}iD;d)N<~e{U>I3?r_eq9I(m0e4#m4?| z4ropg8^SOK9uWT!FhqVDx=P9|5FNm}VCwUr?5F`aj6zdJYF=e1Tv>rKQPARw%>@#2 zlF^XIg>*_h>re!88bA0p9XsxGBtj1SxC%)$b52UjH3@K z!Ys`&su0+kNn@%|Y7so3J`HVlCcq-(CxDimhFFO*b70UC*rVz^h@wJ*9LnxSfX^b~ zClSwR(PxPOb(g;eX*-XV(o_MQP$rqas06bs0DBt1^fqHpGSf7KoIr4Ey2F0c4d z7h6!b{n=fq9^0IKhy6BVL~W0RFP1rN8r2y#bcRU1_O0n)=$b#`ZLKF@9Y2C>QW_#? zK#{*eJ0kNC()oac?~NvTG32~RWR#?&WHUuRmdy7cs{{Z5CPI24ARr^=pKLJ}3>%Su z@VcM}BsPm6qz63MpM>i_W~qQ;u^z;nd-%rwI1fX_Z^^D0SSegm1rcu_gnXn36XjOH zm=j4Mn$2=$sfV$ds;LDpDQ1(Wg(RTJiGEVlfI>H20&sS6j7tkw0YC+xEDpwpJOt7O zCQ~pt1;1JNKOgB3h6FJOPzDy#=d@_8MzMb0Ixf$X6>IK=M=caQ@v$HCS?+zg2Wa@-8Q3`AIf(l5ZIA+Q=tE2M?p zq@*$hhe#0(*@a1z>4bHZSBj8A^>;}#rlaxPNwY?VxI*xITa3*t*84<2lp<)evg5Jq zwvnaKBkQPtn2FRC%j8N@D)8$Ek|sP%Ds&q(RPc}gBHdb9A>tp$7$}d?!g)it8yx1& zEx6tQo)(ngxnGYmTWO3xayUhM>6(#ZKq^P70@**Lf}jB?qXsB2pa6JzR&)vI_KOxz zN8%O8SGc$YLNbLTkz(7j(5w0u+I6+NANzioUz~s=I611-ledg8->E3Giwr43D<{wA zSEjx_Q0gt+B>nX2i*3z#ZJ~oLBl(Ke0S8W+PcWptR}H(MSAT5iiReE2I;a`vuUzHW zX2HWRQHCxYq-Lf-;>*NR~_Tw zG;PXq?v&=E$mP&|39Gd``MI->txpbKdjD%}i4Usxo78jqtJ<3$=(TFT*83`ZPw#3~ zBs|QMQg@sKutRH0on>42h0c#CxDx|NUIu;B#>D! zWP)6fM#B}*)qBJRKcn+E#J~*TUC0QlL+lGqSJ|%$??SDpJ5*dPkue0m(g04h{xX$= zD2;{^><}1|7EPwX(|>1&DZgkY)vJx>O#phs@{hS@K)!#F@4>?RfgI+ImZv&2R00EG z$y?7g@AL?An|gWdvWNY_5c5K6MF=>^Y?;=-|JQ=rGm&!}O@GghyTlUv~eY74L zd>Oh`=;=NkHtZ(h(T_uRzvc;0LMPnMgcybde!`Z@2qmF_A&rU&=0Fm3&y#*t z1z9%1wSn(rMWYG&XKV+%CQY=a+9VL~o14&J@3{n{?524k1w<2y zBcMtb12ES9{xp15l#>Mh;EynCFk-1`k_)=%B1Q2EAy@(vbVDvnz$3f^E&vCdtyU@z zj{U~?;`Xe6NdJwwUAmdOo#lScdN;n#GwcxWlx*%sx%*MjK7jsfk&J*8 zssk}6u`u9$Z5>z|Kq~FDfqd90?w`s zDY-NJrRFUlT0r?|eMBSi8l?tcLlDpem`Ob+SsjHH=us&!bhrR%_n~QI?V`QF)S0H>N z$Gpfca6km&84nD>ZXg~_1#PNfJ0QvgjZA$9a?xinf^!2V2m?_-iDFC0zM&wJhOd0v zcLD~08_t(-zB6FCu$?GsC@3RAmc>ftaw&B=j^FA5(Ekd(T$puY6zr|!Kz7D=I57n{ z=L7ss>!XQHOt7rHVtIx{SoEv#5q=cBv$j?NCRL_(q1Qw&yEtN!3MZCZ+6KLSTa<~m z%$g=WP+Ke#%Q$5_P?pWWCs;FHd35x^W$n=>mwN^23A$&d-}#X(95)|S9JgEDBm#5*;uqmoGSeVV;g^wuLh+-=R{}Xm$od~Lc6(ZsW?(XkV3IsjzBvy)e1CXQe>i5sLJg}F{S)$XP z(y5^kC-GDOV&uqRN6~E%h-;F#2seH&`AW#bz*dNe1C!h;xIl9hxw0eIM`V^NuSEAp zqz%}y6VdU2FgM&$0H^TM(h#2i35NmEH6Rg|2T9%mUo3K$pi_kuAUSYr8r_Nlvq5~|AS6uLpf$7GfGs*_$b|GRoSY(V z56L_9Nrau`J6rys4=hCjIhTGIjRXkrc_0dqy#u`-ML*M#PCPYjRZid6R{H#~x)Qk9 z@S|dUutMqeG4ovZl4Mb#nwoLr1igqyhI712YxcJVrZP`B9N%83<`rOlL)}!ln zc*UnA`P8xVCyR}(qw!wd{5xKR+d)^b=Z$r1U1oQAwi({WpAnPNKDXO7nl7Dqku%Fb zhW+{wl&%SZ&dw#vGUYB@GW%^tPl#5S<6B@Xy0@47G zXo;4I#;+ZjtCHQ3cV+h`TmVmI?aIk?P61*W+;lUCh#XJBvkMK~iNSD_uHBIsH zeJ^GFfh%3I^kGza!{%~Xw)h`;M1tq8g~`TDqDPKVUP8GUK^(IoAtL2pP%z! 
z*;=>od~hH&W7Ms&P}QZxYSeFcH_BLiCczcI>-40;D{RtG&JHk>o= zakzVX$5B4*rt#6s`d?^&aP`(U&=|z=n;v;R>?)8p{&TpF?{RnkhEdCClC$=aCd%VM z^xJZ8tSXBwJa~TV(|Wn~iwmv>0zd1!U#sqL746NRi%!4U6R8>2v7(&b-6}*ix&Ce} zceydbQBA6%T+JlwL0VyOv7&m0jR4snlTvs04uw9X{$As?=hduvQrbvN?U!BFEupUG z+@*qq>ir6;*uS3Zrj9%$Q9H(oPhq*QcQ2}Fv-OhDv!MbwiA2YDe54FGiS~y}_>)d@ zkvj#jfF`^8ZlnJ-y;9_yWF9BG)Y)k?2QLd9F#IFK+q{>4wn{o+EiTn0wutKF~WX~KE$u_zbVzO2BHJOduPnYsan24;THb8xUR)-W;effzsqkva-b(%PPDH2kcQnGySi=8!CxP< z%YOBz{C2CcWckt-v!*B5=X{y*^!r=uMQ!Ig%!*uchUJ5?jJI5}5)!p^zIZbf%;w-)AvzL-dtV`^vKRO^d=i237L*^(StP}@z(u<1pl7<221ImLO3Y;SX6C6k?B-xt@11(%-o19$?_r0zu*x5t&-;dW2Lf)B4hxkyzfv+?NCpoJwaNBcUr&2Nl6Eru(B zE%o;2NYkUDvO~V7e>!?Y{m3B|VhXS6Rt{prL%n2$!%7pM!w0_v+}I1nEVNjo8k}74 zhnHVf1+d?KUfRo8H5POKYk>R-Gs-MBq z6mYe8tz=p4MrAN-CI4}!ES^uh#5v=}<3nuol9wA_tyIt!Y(1ktF1+p8P2AU!vsoQu zO+jj(XB)OLrMePCB4v|lECUi{a=)kYUvu>r&pl0H($Z&D^LU1PC;v!~soXZ_r@4+ZU9C-&HTR*L2pU7WHS4*q!Kx+*pluhLMu zgKfR_Ndu)2{})GgF%H+)9oETlJ9$^N&ZrHWmilj#X>?8Px=|R(w`G)bo9goX+OXj& zc06Y9j`_~fL(fYk^BzI=c1Am3-8*>INGV{yP{)=hwZCo*^yjrRo8q##gR~B{at)lL zi}Q79Y`MGS7Fy-4b2t6mKtW`=^FzO%$vuRy*KrL;Q}W(sm}q>ko>PA%8amh+9XO|* zmzDBWpKe+0y5QTIq!Y#w`_6pZ^-NFMxlQMJe%-6`SFXD}tyK<)xSExzG1k_WH>ELF z<6Zpp)wU{Kes=!BuRI4Ct0_m$Jy}%cl$UQ-F7Yx4=@t9#|KhQo(C%oQa^rF9`aPF2 zO9i5Xj>2@qmE4!Q3lF(1Th&5y2Wc}RJKPUnYuWiT+E@%98 zRrl_f&_gVU?vhYKt&?Vbecf@V*QGw{mJhdEt%le-zx;*YVWrjcWW5t3IpeT=SY3h` zaV+_q!&2r=me~wH4kJsys*9o>i;sA7Prd0(D0wK!ReY%St#iQX(Gv!rdXpw)k2=s5A~;4&WV9RPT>RJ} zbl8cz`yB!-NDF2V+rtH?Y4|FrspVj1)RSp0&^b;s7n+CE&?aO;{j-+>7If z0U--BO8P(C-VW!H2noRnBX{Tn;VjW7livkyxdJ~YzgzoA;9EkX9kus~q1Mv$K8w4J zEsK>RrS_xG%!7QszzH9n@Rm2WgS*Hf*&ojed3#fc;~=#MVGt`s7-b2k;e^hbatyC( z;*TOHKVG;iANhSa2gSHo3QmKc1=@_I;#q)+#E@be#3>*$(;Y(297R3Gu#$N0Lh@hBT^ZN)%F^fEG-rrc@2ztL%BkzS9@A>g)K z?Aviax2tm(dB1z@X?muRhc)HL>ED07*1B9a-dJZ6P+TD;mVKz`#C5^fTV-^TU2*hq z54yz1TPQ|DH?acaP&E|!;5}qCfmtqPxW}koTXRhO`Fl4zi#6ugVK;Igbl!_rwf7tv z&$0{6eG}5NpP4P1$^&jsfTU;+4nn`xvg|}ZcDSFKbUw=|ed)S>mru?@Mb+_|dp=)V zN-H42Y8z4EGs=L2w1Dy zz$P9Th6{)bo4ERYnUW~8?YmjW%fr+6@ECuTkAl@R-DMula1fj_M3LS zRygE7XWYWKF;kScN8@4cly-Q%*o&Y;9O;=4s+I3-nbJ#{w=K!{ZVpV1R|1_dL6h)N7)=W8SX|+*_WxEz6biUy`4xD3A=9$>Xc+?Q3wIix^G( z!hN(pO|zmSoD%1j;5OU+*kR7yvel0;-YePdHSgM% z>MOa>@PM_soy7oGw@-XPSykgz)8T^=Su^{E&TLp66G>ONl;Q8aUK7~7W0OnTHOzea z(mP)rQ(Rf=TRYbi&YqkG+PYp8tpqOjnqICN!O^auyIu#sPd2yG4D3p@+>J|_;9)hQ zvktKp!1mg>J--|la{2AfymMjL3%nS)*zz>C=ZX88f)wWJ^Pk>2%Ne5*vQd}*;(PbH z`TPz48RP!H06YtfIi`;jBE`8FQ@YHV=6?C$GLqC&Otd}ekrCuki& z)zzj&&Z|m7Q(dLOo-cFR(146_L45bsqjdqg?sFTL+15oSdkKh1R!OK?$#{(ciGKsv<6OKDBkXT;lJDahJ{Kjm|zN zCkNL`>c0w@$rXq1b&5S7Pu*wA?O&=ACdsZu1EONa2CE+Unrl3ie4CDbZ=uHq*u>5d zMVA5(e9lIic|mz(%)hkZqPyPCq*@y6m~~Fs1JtU}n6ZTWje>y0x9?nT$|^J2x1Nm= z_(eai11o9}1LgyDq;v27>Wog@gzC&Ds95KMycT10QTCD~LzukJ!{WHnqL$3CMQ=`BwLdP_sH7j#V-rU2N_&esAHU zpXV|wY%Xtmd8&eC8J}wO`Uq*HmGFj7+ot;4&dWzEK z&rk;TIC>3=gc3s~GEnRZ%6&zRr$11^EmTbHJL5#x&V~x-G%%c{lI&CkV8nuK+Qwc0;gt%rs>di(-N4VeqXU+UK zl#U@#pq)L+2S7&|Isc>oP*a|C1Zq&)MZuxiD3u5`(4j^xu5D@Tk0?Iq8)_i*>_MqL zP5Dq97JjtiFzCDOkKz=1_;CBq*<+b=E*i;^+f84X0Ts<8U6lGl>RVdrNp7b<>a=q+ zrwR^jh%Mq6dcZ)NWl*f5`U+U$e|8#52SAMx{}YvhSa@+zEV<2p>0Xp;LFtcD`%shL zIBJmG3H?7w{sjR_$Uy)WV9R=p=6~vH< z^Cak0mlcV`c18Rp@bX6Mc(kixP3XJYC={3>`oCjADS4=aw)xM3L*bG;D3KBcIHJ7m z|C|^6|Dr6$EV1jv681ZZDD4F`WbY)v{)2uB4)soo9RqxmG@G=>-E}$YY?_>V0_$G$ zeoKLpkVP4GrLMGuj`A&}Dt)!$mQv|OB1bK(+^W$VNDlu_s~VkhSbr&u;Zy1Pb@TcK zTl@m004aRPaegN;TDa3& zcNRDMj)H}klj>~E@r1!!!kiTlO!xTi^2J*tz@$kAhhA4R3zR zYQTrE*JeD4QvxSWY&0eJj#vbYih@MQMvWxLRjO~58==5{c9SO1bSim1VSr1D9UKzX zIjjBn5AD?QOPI;zmYc?(U1*?>z9=Mj*OVTpB6+WTK}d6sc5Fyi(+SNt*}wDd_Ut3V zNam8?bKloq2C3Xp!dgq&na?=8>9At 
[GIT binary patch data (base85-encoded binary file contents) omitted]
zIQ=@*I5afVM=PgVxv>&EvpkQ{h80FVYQ%GSIY`l0l6PQMv4>69-gxXSB(({(or4^) z0|S9fz0%^b@cp@UadNWla#spL&riy;l{OJ~Fw3&+Kul5(xX0%%K=rdNYXp4#FESK9-h-a2*L*8uj-TAD|@ zxM<;2w?cE0H)IU)S2kQtZTqr`5}kmzty_g)j@!$IWs>4H%)v()JwKHachtDVryiD; z{{X*fr4mYkLKndH&(g2jM-oEHhd3w9leXYFr%r}MQVi`rfmAl&LLNqbo|Ko9O$_!zH(12YmH|J8r&P_^R+B$2>~yP2 z$V{syp7KT-_Gubo*)o50WSy!$rmtCCHlDs@Fo7JqoTP_2Y+z=yH5;q1v`6Fus0m(q z1a{3%XGc|xICH>qJ!$SMmG9&8(%L=SWGn=XPXT>t*K24QmfY`AxCT6Q;E_`ulNN>z zvK2;FP=0O0wkip2ZQ-|hj%Qf2y*X09?CqmqWXpMb8bueL(nP2gqme#Ry?CvdEp4sz z(Hb*G8OxPGM&D!q0M@KqTZ?$&x6E$gv$!b1QhOhzShu*iXs=!lGcNC(^-lPylD@-6 zjGtw;xqDlShs??td0=kq_kR(ZwIW;I-pe{juv>XpazDHW3<0ih#1Z+rkDkGiSrl?Q z@zSeVBitEF+%b+gfFPUzy#U90PMhA~u=D5WJT1{`N-w&eRwzLi-QGt=r$9?3DEwlpM@h-`kj`Ur{K>q0MQO013+s%r36&ZMp z{{VZT%}CI^R^~V5Tjg>;h#I35QCUO2NQ39)Qti-ZrD&bW*nD9jd4eI0JfPXf1-E9S zw7GkPn*QD?U!*(aUAw(l1K3oEOQ}G>lb%nf;a6@|W3e|0nM`r-NY}G*^5V7FOLH+` zZzd_FU5_5_Gm+17YR!W0FQ;69v0W)JHbWI*S(g%OA8UoAw~^9Z`G;l_Ja(z)N4LOC ztFO#3IvP!kEn@pU!b5WG#2h}}tIw@bl55EIpD0F1FTBV(8+!Gvcw}8hVBuM2+qL5> zz;9ZGu0@0SCOYRl!9LXvDV1|7`s2*Ln%Va> zM3*x+I|o9!?@zUkIHrb6d9Y(UiFCrK>zchDk{a?!E+BDj9t1;v;<@|XdY-jH_Vycj zWz+5LC6Xb+B!DY16egRp0=(B6o~LkPn(ys4&oa#$r^*$**U*kC{-b7QzP7VpFaYQ0 z1Ma9DO;x(EypqBRVn>ZkpuBs2U@`owpp2VFwVod-rIk+Totp`^i8Qt;BL`8%thZ82 zZubivx0MWC5OO|Zd!K66u!Ko(YYwK6@e!K?lDWw_J+Lb^B#PGH+Fd)ys>W|Gub9OB z?D`62#N?=yf?3=Ey=gaeLw07uYcOcmO>GU?h(i|%D&*`f*pFXIp7!#2G`&6z8q3Ur z?pBefKw0tEarEc;R(6mSUIxCB0SJHHGaQs1#dH_f(e1W|-gmUMk%)bt-BZwiy* zuBWsTx=Q6Xjf?&4wlGb0-f_6pqtgj<4UuLsPfqsV?=o#_0ai>Q~E20_`WheP|X7ExX2L zg;?@izUsboChxgg;l9gf4ZI;^L}uOYPyYa3qbMIxvbSDdWybG(a!p;b(qLss<(Pcj zx0rBP_8?PrJ7M<7;)XNlM1&*m+5-K0R#KX7!OPTP4FdU4Dz*;k3yi5fsvEQuY?ema zjltS@Q|VDbad4KCT12s#rDXu`1O@JC)|a+ZYOie$BeS)Qwz=fW-OV*k+T%+4G7A_Q zOPH9ms>AbS9veT>uHFJux{3F(@YTQhn4=KH6ReQFC?RLd*M3aXznG25;x z!?dwmtiDJB;S{Soe|w$PGP-sWi9synhUG3|xxp*PIKk;t#)$!x`K-<|SOv#8>r-5r zBDF+#W09qExwFnYaar=2A-G6g{`z$%IPF?R$>_`FOJb~w(%h-u$~KON8T@NUQ@v}6 z;)2<+3Pyus^Uzf@3$srsP|?PR<~(!Sr?9YS;QLL(m1#J~O1mzi=4gUNj(8>;PE>(` znoCu-w!<_#-=G7jtJd;GxWe0;=55X~o}5-p#p0CgZbv**Qqsm!(WY+Ph|(7kAx*qw z^{Td(L}}LS^9EgxR^M{`{y(ZW2P=(25$WNz=?v1Vm!S0OkYig2Gn zRi~xQ-#E&u91+}+=~n=@@)ePs1@#>Ml}Z@i;ngEtyL1($r)fHatn#h7`#dLW%FGo< z;%jNOZO%BwzJ*SD1*~!I9KRD2Q89 z$|lR0qns6dXKvh@w(bGPrChhOym*@6d94h5O71r8s9w%F(V7?BdgCIql$GpNih+@s zN1vGTN$4v}Pt)O>ai0#oaQ#0@rvgQ2>ovqo(O~}ot8kFbcLE1LRqj%tWtqB@iM=X+l z_17yDyr~|QlbhJmDQ;%Y-e%BXM_hGbS$BZ2M3yq>ubdY>t3&O{mU$3~o2U)Is6N$i zsawi8^W$7>&oyw-3!7aE&wb`jtow@k4mk9uJix~CE=*wKX+0}7?Ji@4&Rh>Iv5fGw zdrMjGtY;-M>i7e00j)Gj;?=S}mU|<6iIsP^F{}L8ty5`d4C3a`?HhLBEUp+S{3T)09S#z$K5aZ)8&Kh zTPKMZmcV1wQeN3w%Vxw+ugx1l;GvetS=eq6x0QJ;fMDIoDcxSP-AMOmY)UYIC-tTai)D<+k@vyE z@+mbGTZ?}wnUw+i`*H`pU9z^fVhOs2a8vFeEBMy8Nea!g8uC)|7DMvWB6+6DF_D+o zbzfTsl3}}*V_NYku}pokbW!{0ouEP2iW{zK%e1wlZL1u>splkOnzs_B8Fw5WGAWAhG{5N# z26^DqieaN6lG-O5rQ4P5inbQc3s9yND#Py%HkzX#mnBq;8nJI7hGgCYDC}uFnXSlW zLaHUUWOW@=J${tMVt)QcdBYr$S|4RuH_5~(&A63Z?WW6bAKC`l6UoNxvS9u+@6fAQ zY!Rj~0Xgkc7L^&d?NRAXj#%Pg+t(+#r?dR0Y-PPp-PJY=ixiB`00gJgr7hAlud*Qf zeSE+P{3|@Ol2h#r26LQt{3@hEdy9lbcPDqv^J9<3pD$1}nKv_;c1Bcm9MxYq2G3Df zzjX-<^966?9=`PjM>LEVs5r;r#U$>=R%rC&PM3)5b^Sl;`gMUXK0j}RgriAj)tn&70Kn6G~2Np z{{VQi@P7(pU3sm76;yP^MQ&ZT!XsIb_al|0xLvX@F`yh{xivOHc1mPz!2F>}#xu#I ze8OVeu$?eRN^Dr%%_$%O&a!&GSU9BnR{>Zhe46J<)As-LH0Q$UQrx%RtaI&?oW?=tJxw{Ech38iDf@?$`ctta(q>01z0otk>r#27Mk84ZC?9sA zc6CkYzT9=E$V5z@bM46cpmRx~tsRHSAhxhBubCgBzi-N_$#E>!h|@bNp#~{CgXn## z2_{c5)naEa(YeX|DoEk97YMRXB1{JFEr;IvXQ8S~K)EB_G*QOMWH*(>jP7wL_7zzt zOKZtq$`K)z3SwX3&!H76c~;Z3v8tHI00;Th`jL!Qd~d$$S0`%r=M`xy8OwbQIBur$ z{I^Bljs|yrwD~|-L1pBOam7=CJh<5v3CjZ5Cyx24q-0o&1a{cG9l!xf*JIQtjz(9= 
zO`(2MyC*f4SeNUcaaxhF5e9Y&gl^rQwO`4K=2Ik7A!l9!j9~pL;_O2!#)d~!&N6=X zD(KTbV2>^tkmF(KduFAzM0F(-ApqXzDsl9pYo>db@=`sE#$SVn?86@51mK%ZKig|QdV|OGEb$#W?X-(13BB;p~KsP z-~-MvyN+sCyN+2f=_Fo-8#tf|Y|6FMA~){a?icw;HKBKM`%)rEWgvbvmunx|9mG3+ zU!3o7xnJv3KGzCMub1v|Qq|dav0#ZFLnizlKN=q4llXGNnfGwYrBIKTZ%Sl$EhM3K z4g&`F&w5rRF<8DnV}?IA{{ULG6E-%UYd&u<%$StA1Go%h6?P|#?A}=qXI^PZ6>~-{ zrFLz@kK(G+%*_B%kVxT&^r#x`Bw!iet}}u^I!4-6R6h#aLXS`Is8O*>Wv3tL+uSzi zP%{4j&X@s>`5z}ADd|%rO7^J?(qMJtqWV;kZk9Dy*jX28DZp=fB-$-%Ha3i|&@rCf zsxu+l(F4FeXkb;s;YX(wKFE~2uRSwDX@hpqNL~_IK+o2!l;a3EZ@i%OtoRo!Wd0o1 zgwD#K;|IM1OBs(l3Y-Pw@=BN8Yyk#3}X0R{A zEU5dO?p^^rEnGDTJ0jF~cIQr#Tb0a($^-`l9-!mWq=ONzenL+m5zSDtw_@INFWni- zbU%r$Ap~36m;CBxN2#2yNeUkItia+b7Jp^dH`-7h=bPcQ+u4baJDz#uyR?(VEit z8bwzdopZIQ8{ja4yK=ejTPfOYr5OYRslKb)O~4FEUM@CV}?URLCRDUIsMt) zo!DWXp7k_rT^NVKT=%BVraoK&DI~Y_sO{xOcicwey(u8sm*R~ChG`QroHC8yZ0*HnD>ORC9M+G{B$P0PP(EW2^vgHDZ$f}?}ne~nacm4*%uF^ZBCgff=f$?HiW zrdn7V8V+0W%j zkP|Bm{JimsVi>n3K?86Z6?zA^d!>q56<3uc=bu6iSdzI(VwNTdaLi76af*>+N8fg8DaZdtSgIm#?1CO?M;qMARL@`G@4c_$dcaR*{GN-Y^*c^IlwwuWtXG;SyKg1VjDHaZd+N> zf3&(u8*tICNIu53Z;Hg~)1ylYu*1jll=}9?Rk*cDaPY|!=amwD*80}bcNLkfr1-iv zzhCVemwZIYxS$zh(z?q(vR>IEC+`a;H=n#~fSW_NosGl9AVeSnE3CED;)crJ z0kPqY5dkUB9qIF=Pb)UyM}`p&_F$s}HG$$S8+D}Di~KmmmC*%mW?UTallo zM}4VY&t)_-%8eTg94;zc(?d}y_T{XOGB-ZV+T-&22Sw3EC zkBT`X!zTocWLHOPuiT4uj%|`K;JFwklxN%;F??49cJbQmR!o@z2c~{rm3Grju)5GR zX=QO7sGGNrpy$8iS)MGvlTeD=qO@CvN1?~~Rm~$*dtENc9U^p%KfB51uIkq%qTDuV zr0NSUueI3g{-ks_^W`o5WaGNg9^KU_DyMrnpD1Bzfrk|~9or97i!Kg`sS!MJaJ7MNUHcuRcFGQOU* z73Jq1T3~wOqP@RhOi9AlX?8rCEGv@}%{88(X(}d`or2|!+2XVsZ|uufMZmh4e8>BX z=}^NAHhO>CF2B}4`k4FuO=P{nb}UIG)3B~hyDjVWrjt>5zQ&NqeJR*3NLrX$qc6oE zYAhEf6z|5N4NJO<3go#Ixu|pRQo2hVNh`-9#{4U?@ZU-T;**MDYHsY(7N7@ipi?tR zx|k9J>qc?V({o9jPz0dl)4Op|0(#T8Vt_Y%Fn{7@At$6K`HF3WZt&)>>;C}Oqr&<7 zWy52nKl?DI>4!cY)&2AT0HD@Sa{3tEDWuYF&(@IEgaM`#N=-D=05sD|d8V2GQNX7I zOh4xopwJ@Sjh)V+arS#?jFRJa(dkeyDI+7PR#H`z_BA${e01GT-tTq-H_iU(>sp>H zvs-;z?J@He(uWxS_j*Gyb4yIkE_k2>VNK0AnqeH!1A|n7ssd^1-t-KZ8r9P8)5?1b z!@AmDg#Q4`HubD<)c*i_kSX zGFwjY$}sA1Xc-`Se(oy1>TqwZAV$5WqrC;S!+gCJ&+z+I<+*!6v9~C_4trNP2YlkR zEiTlrlQ|_ng}=N10M@8z(K7C%q{n=Vb8JAA{NL|=ee0W+Ib(kOcJr3r)kz!Q{BCuQb|Ij!ZyY` z^zBtVO*U*Hh;P`l54?H%rn(ik@}`nh3K~pqAa%u5)ihD7MY%T{gyj?*2BJ#HBe_#R zv$r~4pLX$vnPT}EjO{riKU`CFT?Px_(pi*-yMuyNhFhIsEDAZAEu;kdhaPQr39!?AGhkW zXwR6RFh9E_a=ceRrd!H%shTy|%OeJTyGwNDx!Ox(p-V$S%t}#T<_?X#blOKjRiwd0 z&T=!4tso8LOPt9%vhL@V2AuMFF`y$nXFY+aRvpay$=iCbQU(P}Zzlc3woiPXwV8J@ z{@AlZx$_@s_sv`u5D1$2uE= z-%_eS#(*tK;@Dg^`3yj!p;hwSmR;Eg+OsEjnNjwgh4$cko=s~^M7n*w)x;(*v}OKj zH(>Pqzm;ROb1adFSBV>jUO^mlnj*B+8y{?wCH#)8Of8iNM$i}W`c;b`GfTR66L80> zW9ALp2c=IswYzOFG=?MX@@>~%bhAl`k7%*$Pev=T^Il$EODp*Z z>ITsfe1Y6_?^4AA&LNua(m5v|Gb1q}IP?^f-TB+C?dOgq19s;5e_!cL(I$&Uy14sH zt8q2lqs!blQ|t8YMXa<%dX^d2NmPj;239$FBwiJIfmH6Nkz{+BZdxUI11mw!KBAFt z?;BRSx4n=FN6#DXz$$p#k4i*^WAkpLe=SGLDP`TcKttA9hAwpMd^#Wo4iHS}Q40=C+bHl_f*>cJ{?*8f{AVDa%2_ zR%EqQ+aH_emD%M3@7tw5-gmwK07!i>tSWZ9HdHYk*ECw#PpBku!*U=-y4qDSys;dg z$W*RfntK~&mT2OHN|JeuM2b!b=b;{yO{>`rC)A#M(E!dsF~4W#Ka}?ovW5z>F4bRe zQ_W`?^!uq2>+LTs#DRjvA=r30=hO->B#zd~+wB;RHg7G->VWj6y|KBTNu}}@HeIo` z&&Xe~#@@h>(v_0bS8roV(Ipqkl8>}BVWtS}`?bgz&N}S_^`^UhoGj^c2$EEHV8;rJ z*T1OgS#!GEPSGlcjo5|>)2;xmNo~?d#v@h%auK;19sZS*+;`B=x<_O)oq-v8jtM<# zw2puv#vQsUb5e_;ECr;QRU^*m$mc))x`CJGl1!tMwSXKP5mj;>hPB7p?l%F110XkZ zS~1_gqvY9$;7gg^oun)BssY0LRy5zfx!%gy+qG9bFVdxrr1JKB>$@@F4n-{nosHXM z(_txWSM3r8E;lPT%n$OeqSj}cG?HnaSe|Y3q-n8%AKpJ+D~y6P$bXctH&JjoXfa&8?p2zl2YmoMeI86 z?%vW-6t^yAU`h_8bKAZu&CGu~3z-R$MGgbMEWNTZSoib7`o+k#l^@938=%_c@&P?- zMp+}bm_DT>tntWVN7z`tag2RBQ{+h})Rl5ak*zEn5keKSljV&42YSGYYguh%M7fbB 
z5`xVN;3|>_&Bb(P>Io&18Lgy(MPP&+_wEmFD>_?egHF1((&Bo)nal=SkJ3tswNqKyh-(*F(NiLPs zcvjz#c?WeY!UZxkco3b%olfDOPpwp7_bF89O$ml8NLG8eKFWM|+ zmguxIMIceV_}IOG?Oct!&2uo4-L_m8;A16)bYWzGNtj2*ckS#rj}x?Wo5wV2Ne|e z(oU+NlB>uEprhIApq)cbOEj`XcW{h%AN8BFSra#taNce}Z(j6)V=Qnp+NiC#iDX%J zr^~?VTAO_|uWxSBD|r#AMPyx|{Y^InH0)`^aFS0I=kFKQgkhsNY&pbarlbxfWBOHAy z2=4dFgl9bhft=N*(lc)FLeRL8$Y*Eckdh3K%BHrO7Ll%|iWr;jlLT9TOya7C7b<0T zE3}j<4tb@PA1>J$qyZb8RM|2t`i_<>kc;*fpJbWX+f&OZvzl_#mCx+69P$ucP{ty0+A`A;N` z6A|3>s~T)mPkyj@M`D7_>?@;|_S!RVZipPq9ya5l^)+sG&?(Yx?C1APP&_Il1miiY z`fO;7B1l_%9K4MwKcz!)52913B)^6pL2+>pJ&;8JSn8*yc=L^Zt1Q;nTu zX*y@?RiwMyrm>1aurX3s_>b0^s%jr>UHf)`IssTww9O_WNf7nRgNjt(>?b2*O}Dy@ z#ORx%KX@_eS~5mVsCFHo{q40%*75w!u*>Fc!IR7zK^^f@-Q6_uNRfcCF2dsK>EpVLGN{Nx+&|K^ z^_z`9>`0O{`GjqbDAO4@>62Mr&d9XxjJR%A=_6+H;*+KqkZL>Wq)VlXb;OQ7^6g!f zn`>!utf5ZFZ41-mH1jBztf+3achVZEhvBv0O&*tdK_Qk{QtF=rK`e zQ(HZ(sTO(3=9v?ubBUr=-|9KVSd!*Lw~&9}G?X<4)r_=}ceng|iCe3D&6=ha1*#e^y zEW46L;QIHXJ*Gm4mPgOO)NmDuE9B#$r%M|wd@8S? zBbue5JCxc&VJoq5>Uz_n0zvoj&pG0p%PPmhXWwpW25BW=i~$F>Dum4dHtq7t;2xu? ztDbB!s1w4tQO4W=ex|8fGDi%7_-?@Ur!xj|fsjW6v+8I>@q-{_c*76wbBZUo6Edu_ z47YQ^C-A13Lhj3SJo{1w#vC_4TGHe)Yjs$Y%Nh0*=~5hmNhHw<+?d#;dyxI(#X|Bp z!NJG9PT^u!lQM5u7tpgY$*AQn=zc{j*QYfZnq04(t%LV+I5ixSJc`7U6dstxI}@TM z0ymFx$^rB%nnfrE;&~MWcM*NiuB3YOG{GMEH|OXFtyQc|*>!GaShns{&lz55GUE^Y zeT6m?*M2fPVul1MV&`|JI206-ET$8QNWdc-NB61}EHFxZv@3*F8$8Awnw8j23qrV2 z&U)s7vb52z?1svod8v0?z@ekc6Zdd`Tv4|Q8N%cHzO;;ElG;gkDj4|t*vGX}B_SX; zHx1;*?pXZNZQ$eHryn$dS-I$?jb)|%!#DcJ{5s>(re8W({KZgJPa}Y6Yf#W9W?TON zzql)Wp+Af1R$kZb7m~{)`;dLodi^T3w10oi$`Eq6>(`}QlHEoy#gP5{GUMx-M3twK z18C91<}{hcK+bAgQ5?D;LWQGQK4&Th2<_`uZJ<{I8*81Yy;&dmk=CnA4anOA!5c)b zSV%gx9K|KnxoacJr|g$JH^}Haim7uG!wTEYX0PTi-`+_Cb*pAMA#`1<$Ok#)N8wGm z`?!(TQ@ifS#L$@9(9MqG*=A`cwFOXOG~NJT&#f0G88G9_I3FlGc>G026|B={6t-!1 z9*8sg)91G~tGS2H#{mJ{MQE&I?hpumX7YD;3|)a=$l%gNBr7b2X=FJEfIo}rRUYA! 
zG4Z*6_tt>>xjY)H3gwS6ACH>{zA4d0W0Xp-_#Wb`C!si}d9MV4gOJNZKn^8o8_SWt zUR6oXD$7SIMhvk6t~lM&p-C4m;c!oXTC?_tnm;d`0iVK?V-)~EhHC zcD8ohSMIhBDj^{mjNk_ERoW+alPFZF033Dq`c&9VjQOxi#PHau5@ZXBW8R*b#Wfde zub7VN4) zo|Ro>!TbLJEe*Rc8rz&_tL6vX(Kq{dyz~=QzFMCykq_1eJLo|ce!E|BV#OwozO;J!0Scw?VuTcH;(w^%F#5WGr~d_oSO_&Dk7NhVskHs^s=#z|AhxTK!+6@C8zE`N)#{A#q5{^B{L z4AHj@&jO}t$JwQ{K&dVRBe3~#+NcP66#f&m^WK)$ zMv~rjZK@-aYUim0aBDU|@yMaK;h*O#&d<%)69JRYIz_@3cl>QR%86BOy_S-FgEi~zJ^1m%%G~fCssTT zXoBS+68V6g>9-@Q`eLZ+S7vJoW5aI1=hvD`aA_j-v&wDaStA)m+Qc5T_CG0Jja01X zX0ugXsr;(z$W(ggwNpysGqrF;F@kyI)tgi+rrW+V@^+`4J}pMw*_oBmfZ~}nFJULy zk(rTINFDhZs%r5f%Hx9~f?qlGt*;XP?^G%nukNGi^sFJvsdM*;{vUeX@k6AV%%NK* zF|#AulGsux#VjhwWPnVgoE|?~h;C_Fka=iZ6kx<1V`AvWbV&9!&X?`7!(jT>JQSXrb$Pe|YMYNaN zq%+168+db-L4e==YTglc3!Nkivi$ABbs6tfZ?zrg%Y^>yclD~5dWV_&n3#EEss8u> z09vVT9dnwy2$h((CYWCp6HAVUqG^5Ufhm7_zo%KJmiPC6Ewo)f+iyy(rC1$8cHVYd z$MZ`5f|FR)7WP$!+-O?LZdnt=jtf8&n*~*eB3;t)CU%DY}DBi~UWi0fGJR z{{U4x!qBa)wdS)r<~UWAKdAUDKna8b52u8 z2ZM@S)OpQ2Gyx`Q!;w(~PX~$yZ}?>5OUCqwe=||(0G|%(e!Twx=rt#XzUBLW(=q*O zJq+i=`k&+f0HD@AOGYI$nr?AH=C(#`GnztaDKm-yT+>OUrsvjxADT)5re~U9NO}5K zPG^Tz@K(uXjc%+@$gJ2b2dCj&usl=6AOVci#WI&P40Qx#Qf7)kGal3figBjnGywH# zly#^EMLZe?P6-SMY!T3k+}gVRp9Bl|2mSn7=fSD1t|YX$Nuq2zoYag6+Qcx!4hTI( zFTHEuTgR$ee`#6|-!J@pL_eKlDWC~foH#kC!KZ^rW|gJT1IeG5k3;?6{#RF_P(yjY?} zdxZ=0jDhb_>RNl)Pli&^pSs|2RrmAqt`v+`O*G0!OQmZc+67{kAsqPVry$o&qucMg zf0&Pwyvxs#TX#7gp0(U~A5@z5>Ub_2EU@lZ^%)#hbGuj+Rqjlym0Lo^gtUP1CpOAp#(rIabg4 z_pBubyL(@&gkaulx*&Nq8i#gH@O-=$X6^?3^*KqLL) zhy3bwzJ(VWFj!zNN^_Iakw}g_COH^kjKGQij^z82M?^c@@<=HtznpH8+MYXMVW=+gq`ley$jl+-QLKWI>H@oQC4{v;X^RzC$&Qa zi(~t{Wr^*r+hVEUvF?7Eteee3^7V^4F-3NffcbYZJfC`fq*GkR(kA_&jH>cawOi2= zoylXjy^?7rd74OE@}BrOIk&ZOm++zdlhB^5pbZ^%&&+Dzyar z9W;&hn@Df`w3bjtS<`RJk4n!;%s1jonNmAu1jy1J9f|o=3ajDkie;=P*4^)2 z@bW7&1&LH=pk^c1s^C%7poR6z`Mz71l6|QRUwCuWlhdc^T|9SEE}F@298upYwp=K} z40Nm%p5_fR?SW){>KSA1i@WcST+{6J9Xb5iW}aq|m*!A}55}>p+_*94Q`7EQZ0)2^ zKGlAIv=R)mci2~~z^Gfh-WcdMp04LP#6?)H3R<-l3uRQxkL*+g@e|5dbO3a&3w9-sr5XU{j zhL?COy#3Wap1H1t6D#YX()l*R&K6sEh2_HUbR6v`u{BoW)xO1ewy=4Vp^jA_-WkCI z-mqXd6YExsWc#DurvRMC>$P(XP>FrobjkYvWFSV}4L3!oJBCBMQ zHoF{mAC+ZV&zr9$;T~t)O`pV1Y*R$nH_D4hb`U8H|utmmoY;ct;~2}7~p}CpXXKutdr_jHrB#4^0V{1bGwhz zt!g{NJ-xCt5Ihsl^1QFf$0Pm&p7j;Lv$dM?))fttw7M=sGjZ3wXU?`XRETXMk}XLr zWuMIRK4x9ND*U8^58+eXM;tIjjjH~!Wkn5Q6Jh$$dSlT z-d&-(_Ulx&TUiazmofhGAY+er%*&6wM`2Mx30;w+?pAHUXFGG-+P79}<_&U_S=+^N zb@rKT%rdVi-e&wCekxn5+lx#1btki$=W%6KcG#SE+lqX0sEwjeFfMXP>%}gibtV0+ zt)OQzDzh=&&QBC`U57S==(7Et%y5Sjrs#x{<%#ai?VsgUO}oVRDI;2|Mo%e{CF4A2 z--=5K7T*ayz>{li{{U%Mzcmcc5c@$AzUYwbPCjEv-u4KxsCziIDA(+eM$!z1KQ}+p zrHfUw)9hrHG=h1im&;(kaaSCltwXim?E{C-A`^m(Ki&|j!c-%bM~=pf?J7V&&pCJ zX6f4iR;{DX_CnEor5<^ka2&VF08l>)na0|pFLDTgw2jnlL!9mJPib065VL%%%};%& z%N^1QGc!o4Kv9A{xTt1;%<8U!GMsE))hbt49f%BzD((sa9@PnNQH9-%fzqsfof1XD zf)A;t7#k&3VjsLOwM|vb+GKY2$kRK%?VR!HN%O%WmFJa^AyiV_el=|^++o;c{{RRE zv*wYfogo94Ju6i^>?_#03rQgJpXV#n=~m97erVn?C95tPCTw8;0DC^QLgH6~FpKx_ zF^@`Sz}C&QysXK{E4K!xmrzq>YLU2w)&Br; z=1_2;_NZgMxtZf8FjMaW$C|r6s7u$48Ax1mz49v;?zP}5txt1twlmq*@vB2q(Sg6xn1QA@4tlf6?t&|PCV2j05>G6C093VN zIl^QfvaU4TRy zWQIvmiQbVFJAFCpSGg@}NMeG}x19`TV0i$YzlB(^nBT|#jO9LZrTT3ntz|_b*=?3_ zBgoH}^JEME0FKrBJx*J-m_cn6D7fJP=hW3TphcQ?Qmvizh95DVkfX}$oc$^G>l{}i z;pA|skfUcL;;9Ivxry3H+$)|DbISXgscmoea`1^u{Mo_kzSWm4%~ZKfIZ(@Pg=1uA z&gGH1b5-KENBJE$D{<1Sh}Cu(%QE91F~}4dRg9}1>_$4AZKnm%il(}u8q>+f2xE`R zp|~YVtSF?PzQ>YkwawFo*|FP<T~**wgMyHkzB47AI2ghPbe z@^l?46p`*ED5@|%VYq;5Xe5n}#{eAXr7a3lcCj*CJ4&ZxD<>eS=}RM-7tXmWocCl5<`{a;4fU3)JaT^7W86exy z!*fiRWZ(%SiCY}3VD0>kIxi~CTXeiPATjCmsb`2^ 
z{uP{>y@*3@5*CR7F6)n({{X#IVqprh6=lal+?-X36u^G=;yVhiBv&oFrCd4r8ltmo9cj}%kSZ~aoqaP}^e1vgcu~5H`eLWIz6|Ws zf-(0A6lpPPN=S^qV+s{l%{<6QEIliqm8Yqt`VE#sT&}>O zcpx0q(HnHPlX|;ke+C178K@(4=2R=Sci5pu|wUN%S8rb(rtO3W_pCCZmajmJFknp=3BlG(^n z^Al6tJhEAdiDhNy5_Q{AEyQLel&i7>h6<*kHpgoV1Y$W-XV|fWmR4Nuq!8N_k<$wM zvZ@CI>rhU~BX5YXO#RaPh^L9v#L=|4Z{gzcZT$uqfXv<3CD|K1a^12?qdy zNFrk8nSOq}^);t0%*_a<1|@JZed;-GUvOZGzN^-o6l`M->J_?RH6V$}MIepGmOZLe zQZD6)%yO&jC4C#xk8cC55D$XgJGy3Zrd&c}cUDJyCN~z1kpUA3w3A z;upA=3Z@_%SaC|%X#-uTHq zSLsc4cSl5iSY?54m8q#SHj*+Y-5wT2+Wo-oNT+)aazO7+$Cl(YNDocDg+N%Y>~WgW zYKbn!!;;J@NZ5IUYSkF#U{!rRs^n5gzjU(a(~6d?QY<~Qvm-A}xa(F`wn7Fl-734@ z7?po_y_jb#cr?YG*|whi@@g9bU=+Rq+mYIlU1Q+MyYZ$=@^Yu1*<2j_tF@4TXe}JtQ0x4l_M$}B7EUTUCu>Ib| znxpnM0gl4nU*7hulq9Ib6;Qni;+Vl53cHYi?#lHa<4sD%Bv6JF1sz?G=eJx{*0y_S zn-AtRs9(#GOBYrCCCBxri&l*0Br8a71Nzc=)5y&tFB#+psZGC}gKH+xcoY-lM&xZ094e0I=|XoEh9&cDDKwIKY;eFF zaqCmVaTTTZ_laE)lDRm~p~X~|Hy?Zr*x=@xR%tTM!Jj>8+T^EWVP=#&Igv`SCz0z> zT@x7x?!I~f@{#H)j4~o_XOIOP1R2hK>i!m2h;?K@K6ly&_|{EXnk`W}TyBj>J5Svs z+K(}>8!Z~NfpHm;cP~%UqPMs6NqI0p=%i;Ru06K&WZuy`bSuzONhVc_aLp078{%|a z72Hl6KBAiJvNAKHyvla&IR(C=t_W5bcF0VRzz3~2X!A#y-d0ZzPBLkgxn9OS(hOl$ zPXzkZv)v`N!G>r_+r}53f2~((luqj74cJt1$utu^$O1_^{Nq23G>P&EZAkOan#$e5 z80XroTOsohvpyJZ3eBAM%`n9&oj1$7%tAu~PEVy&^UFfA24I7X1CF%p!5XP5c~^TA z=MoQ>cW&fWNiHzC1P}IoDb|5wiBclCW1OGiq-YjM{`d~Oe4>&_Qe<$>qE#RRlb)Zg zT)DUc50Eows;R-O^8%>Dvnvk0>q0w}SXwsPPBIw#QfsiH`-QWt44z=)X&VU#=qgFr z?s?`|%kF0*cGl_G)@9Q^5FmVGa}X+RTlXGJI3-@qjp*zmm*gE?!`_5=B)XV z+s7=84(VjZ(hfHD&p(Z3GUQ58&Ledx>x>S6I+-Ps<^hwrqhpbd-{&=>h!!!v*4cvw ztvomH3oiuqslAG1YZvatyfQGu?m*xWbJcx? zSMr@(6CuZOQ_iL^vNT2G&+v5=I!Q1Nt_o+7(Bh76psY45hypU7m*!j!PJ7heZg9Bc z+N=33$TDLc4k~nwV^Hgo0LD!xsS+lQu>CmqtZhPB2B|QPK#0l%2OM;*h-KX{3c2Go zR`Lkmdp4d3o&2Wq*!3f?6{Wh0GU&_JvKB_hL8~t*b=%*zYD*hv%qZ6YXn0hVTa=A$EV^1ZSF#O;Sde_psw9F4;M)2)s!gdTZN0p9?doUpsHiD$RzyB&8uY zrZ&Iskn(a6M@mz#VEPd{iwad(Z^UB}->*E&b$n@Qkx{J(POYB;PEl z9BNueJ5@(>T^+>9;k`5L(6~B15oGkr>{O@>#NE?o%&;-9`xZRH>g-EuO z_eD|e>;5%UXf)se?TP@fVR84i{{Y`q-kfHbDb#@-2ZoL;^%6Y*{{Y8YnV=l_e^EH< z82Yz@ai;1m@BaX@ns0}>&@~Tj-}(h&%?^Ee0)s&Nt3_;PH|l8*6dtsOfuQe5 zXwN-q8?6EgF+k#!^FRnX)0a4?$-$>{iU5;5QJ!f4=|(6R;B!W4pbA`47~*M-P85fl z0B-InM?BML2NVG^lisBq^HqaXq0JyK>6h|p*J4HGPY%D{6?<3Hqg7o(&j!{`Sx4|1 z#0*VqXu6tPF}1n*t)@6_-51`ZtCRs9GAYVHs~WbNblabk=C_lWn0k6uHO?roT%UT; zzXytjoCPZm=AD+KA`PVtpv@CTGv;O8!nif2R+uT%2 zC8;xTh;N`NAP$Ye2DzD+v+u)FHZmLfQ>s1-G zprs4ydPa_`9kuC`B@Vz(=4&ZPNv;=YAH6Ndy;|`8r48kzj<_E(iBY>6spYSkF6tg^&K+u%3_m8l@$tH=i02Jglv{lM-`i84$ANyK+k_ljEH=eZOxqE zEj_0L?;`E$ImIy~^AFwWMuRKcNS2!AjDbd3*>@9x(x0yRX`|X%teI)2l6eAj$U9f` z6<<+SSBgNS?2rTPR<9z8JJnf1V9}AZ{{RgL9>y)D(1iV$%zrU*Mn*yURoghO-&Bg` z8Mirg*%OVR90AQzo(U{xS#6z_nNasE}D_j5w63}Oa(5AP(8<{qJa`Rh_kG`5QjhG|<@pXHDMGYbCz5Bvy>aoARt zeMFiyuXPKpE?Dla<-3wN1?l$*cNQrr_UyXba zdiKRt)g?sptKsaZ&t zmeRe%VOmfzyo~hPM{HJABBZ%ZBJHW#X_H*|g(9}Rdz%-EHuAuXDLfE9ZvAVR)O0`X z-9iX_`Q9Z)C}UB%@Gw4J+={K??RBQt?b81MP?k7m`EtMd2=_fJbHtKaT2FWO3!Re0 zk+cCAko@74bMq0!b11k@$kRryW_U>bhI{=roRr*>V7OwuW1MlEj+EQ$7n1%)j_MnE z@>QgNm*r8DjPOl2MYvIZ%`Vk;`|r0oKIXJ8Y+%3EG|4Vkb%ngM8=$M5!yAD7dRI$k zX=|Zg+FNh2i|c6QNiHs9g&J2~(jIpe$3a>*4Ph0}n#!`exoD$cTo3kk_dQKuwy4^! 
zucwxgV^}9LM{JN4FeyAEcI0=hO;+#AfuoG?yp~W$7!fWxIr@J(#!0&mVo0%Y_<0@P zTo#Ocst;;y^rS@4&1}vi`C=UHJ&)^JwwiQb+S1yaXJxf0+xr&U8wc~I_=FKOF)R*& zH$F(McI%3Ym>EM-!)YdGvZ7Q^D08uD6NWMNsV(en z8f!bdq}>I~jpoO@K2QyX&(zk6$Z9g1u~Ij=hvd1pXrw(d`7Qnxucq0}FP|b9!4Vdb z%kr!Are0|mms*<6QAnNL)Qp7)r=YFVqeZ7hb#rei3oWy(sD5HbIIL$o8!R-IwmAzK z!^ADw0hF#newEPi`1+2_x?R0Klt4 z(jPkI?wppAL}%S1?JP0!=CWm!UfL{I5f(C#pCWS14t`Nl89rjUS19XyJNfNDe9*{{ z<=NGTABTFsV|5*tP88)CMtXurT9zwcv9!W@O6DLHH()Dsoc^b$#R9|9eVGXofljn5+cI#B%d|AY=O9!z7P^eL;#+5iIU5)fays`Ir84bfUJE${^7&JFsH7}w zvy81p;1gHpH}>~UE!E5tO28ozd28Y2pQwbF2B7 z;880zpraw__DLX6@cC>0&!s^WaoO+k-JTX52-@GB4=cz+$hcF+b3{!SmUk)^&wi9> zE4IlEwWu5L>VZOn9f|&RX46lxvXSp1-ez&M83_4D(wdhR?5(;ackRYD)k~Nz9wHHj z_vl#E-M3=Zk|&>2hSp{tNVtTiBvPY1`cyGqBr{DcR^KFx@~J%!YGT>q{XEM%i6P^d z+z;nkQpW_hG6aaG2i)CHnMI9VqY_k>^Lhy+mD)x7t+Xfj)|d8jX!bHOG5O8T@4Qdr zQ`(`lxp*UlWsY642?N@a{s<TxtK@GdjNV?t->i~^N!?v;S;S~kqz_`-boTE z7`nuYF_mAN>6)+STu*g|kz|h?9e_9INT`8jE?C3{L}UU*O#>JsZIO|WChDF=gHCE~woQ78`kKr8+ zY}-ViX!4pIl^u5gPtu~fS)hTGBOqhT9!RAZsa&^5IVqkdPm?F9VAQV%_KeGi+7)|| z$Kl?ecFaCf{Q8qn1PmkCrd*$yV>l!6t;1tx(Hg?tk!BU zxnSy`f6Gp_RwVZLo}{=qtoDg7rrRNuW0H4qR;EXG{q(yJ zLBQgkmk_}o#Du$J?_dJeNwG3jjZh*)pSo4YrBssFY@y@M?@ibhF}!&IMM)9!{3A5c zArl`lhW?m26w_ojPY2mo4K!^t{{RYSkH)NCNoujOwY*EXaPh7Ps+$G@95qDE&&<1r zt}}|3uEOP5BxE5uD!nSpqd!Dd2dfiKFuUQQQmOst8+fRE=S|Uv-aYxP6S$itf+GX7 zOu&QpLaF{{tjh7Ue3b>dWZ-^vo~iS%%nP@tMXO?Kh*QcTSx@)SQkLUPjER*-_d^-! zvBMA2q=15@Tdh4+C5VMZK)r_*UwT4zADijco$t9;Ow1B851j_X>b+^=SXcaj^8q zwPw*pxBcXx>}{CipSw@;Eu@LIlCO^SUG4)0npv1J;Uiz+9<^XRPaaYkTO4!pii-uv z9xyRC_Mp*7o$>$*c2N>N>w3b;BlH$@N8@OD4RaejxCu_MH1!qZS;~U*oS3g1h zXjvp=G7|(#+`se`Prd|ILmMgOhCh{8l_HgqT^k;Q)}?*Yv0V0BAz?;x>{+D!5;lnyWEfaUbzAKY!D3^!Ea4Nc>rTi`jNiGt3PURe zA%@^TI((MXLv&zhC0VdOLb#)t=0MRAgY&82)uS>sL2{Ben2`k;#_SXHsNoE*>fwW9 zBYEbhdlN0u@(`HwxloRA=}lQ5?Z@qCLa#(Y-`<_V#P;Igv1M?m2PC!!(vjsZORC7W z%16)riPM}taH@O$K!rP!0@kt~s%Zw2taDOh7V_z&hjJWwuYFS9P zk;AbZ@_N;H3&$d{EXsIgJ*s4HEnnpualpqvjXM(c7}BU=jPf`=s!$P^iBybqJ&jjc zU6?3bzg!N~le|(NmvWXIj-1o6ERJPu<0VNw#a5O!UaN!NuZ3NpaDDSq2v+2TRKuXl zJ2pf54{ABA7SPPMK1_;Os_x`BUMh1g{Orl*t;~qFVZ$zaQZeZZ$k zq{fMos762>3RxlpghFtE1!1u0pd4DJfs@WY#??;{*L~#h@VSqHq zp=XpJAcIzf0u~714{~^*5jq4mh**_x?&G~n2olIk$TGb*ahi%r5Jj>zBz^1(YVf<; zjyEyuv{rIO#I$5Fs+Vt@JT7Xh?3vzO=s-JFer&n#Qcbklmof~Td0?I@Aa?E8FvF-E ziXxGtW7q{$*%;mc9C}k`x8BVvee}lMRE2m^j&r*{)b^4virgZxVE6jcW3`K4Xd!-5MhAKY!aE(#LG~3)hr)b{Mse1sjUUaGwmOaj zF9Zq_NNwDZ%@S4cZ=uY`<;ft9xcXGYAc*;?lmk0J$u(F7i56JhTouU2 zP=1u{eT0baqJKFqB39U{IRiXnsXgfb09OY6z+rRGT6)+dk|f?_rlZnhsyh@$j(J++^k=}+I~=SxfnH9O}A-gawc7ozhRO_DP35bGFy2S zQ9`$UrA-&5ZXP|{i2dprOYdTLWYYmL$CL96XPRhtdlM=Wv7i`{hFE8}YO!q>lC8(g ze(@do;;2J#<%*I5nD*p%0OV$@t;DS>I|3EC=jlz{vRF37I}DA?2pOs6c1$T5&mj8N zdAt)#5i82JYeG;Yj2GS0pyvXmpt(A>+j4gTMp_W8KL*$Oat92!l0O#iRKJ_}WEem2mQ~)q~Vt@zD+dXO~$|`)}y-4jy zB;I-At|g{5R3*!j-G>!#DC6cB_o_L?&JS@?Gv$4&Mu&3CtXna)nv+g~Gj|a3b~aB+ zfnN)N-P6?4dg3Q<7F7p$Wq=1J@a;wn2~eg#Hu( z4H>HdS&zQbpP;2>vIFILeu99xTQn0>185K2oA4r-#vEY;r24ioKnk)T0C`}ysj2?Z zyASTrA7M#qA;I$zG5zAlHFxZ`!4LL8U!AR>;o72*ohwo_gY#X@CyC^bjd{^rJL{ zGKviyX^beyq{SygK#5g>AawfHhNYw4X-gB_ta~K;tdF=>S6qzOYY6_{@M}Z&u5I`p zyO*ctDGcNHW|U_Xz&dBWJdV^1GtDI({ip|%NCuidw5FSyU;yHttZF{A;YbP8;+@4n z4N4l2?sQs3+&A{8O1$SI_fC3`{80+6jjvq5kmos)3nUFgPQ=Yo?8p#9DWj z>=ybF!T$9iFaga0uxXAODVjG0WK)I)lpKyID&zN3bL&VC)|aknFf?p$S_au5FA?g# zp7o+$OIZcPIe3WstJCXT`F*G5$g6f*fs6NY4beC0?@-msPRB}@vD_$AgSc`Jdgk?w z5ZgJP*%_hh<%h=4{F-kR)g4fHnuzRTwne_`X47B7@M9KgOZP#im`;o;z(Odv^-X(yRT> zxb>*?-C2&IcMID{k(Vx#zsk7jx1RLq>~$pr5Rs2oT84c)S(VSqyH8Kv4k^VryM(OK zy^|(zF;R~(91?IU3lV&=2=1d$P<*bEs}uFBp?7`k1L>UBx4JR4na;w@Wh#)v~=Rq_b@KagN6|jk*-wi&V+q*cx;X!GG 
z7cXzJG)5&`8*<*cZ|W<0Dc%V#0Jbw*$fM^&+!5WyCqeH_|#q>+bD-n5c)BM;)nas~xyeQfq6D=h~{<0#IneW>4P z)-p?ahBs)#e3=+5IqQn)G_w?1bF@*(w$pOL&-_?XvA-Ptm4yz06@{Eq{h=kgkpp=n zt7LqkeKYS`P~Pm);)y5aWM!3*lBLH`3FNXS1yLTcqbhv}XnxY2B`Q%|(CTZ!?y>R5!1qtSP8KDeoLU2^i)2`wXa z7jte9TpzxS$1Fb{pN(P`2-wfdK_Q4LK?91n1VWQGZlS%pk*;kaF`Nc?&}Umr;<+p|aWM8R{!G zT6ZJx+uOseJjUT}F4`|WQ2oVXc>J@P*zsJDJXQrGN1jJ3BLj~!jO6?070JaNzJska zvs^~*U6E5KB}wBvaa)@0xw+EqqKG@ZvZ^ZX+8vp&273;fsBpZl%Tge-)h2CDJ3EO( z&u|pWjzX&RT>90mXisXBT-jR(aHV36Np=q0XCBp4OYoMZb8_0O(fL;v_kU-R#OkOe zQ-gukhfiv^uit9k9(nCd(yXs4#zM=L1fRWAbmwS2N_R`%G_T^3JYO)CokGr`xyMMy z_*Tx2up3#0v_fOEC8gkQ?bo+@tv#rhU%io@Klu5Tn6b$``cqOXEu_G%(f!svGx=9V zdY&9%>0Gm@-Ffia+k@s@7TnVGK)pZqTITggWP4eiBp)xeZ|t{}D0W@Qzq)6Y>Ui*liSd4*f< z=hK{@rFNbq@Wz#6r)qbQTbGrglrPD`GXq%v00^y;?n&t=Z7!w6XvZwtmfJ!V`dL8-4V&?-X#GRWvzW%cH=ucZ*_*T;#|%gbli>sem%)a;!^ z;i1eO9_mpN!lh!7423rVw;q+Br}=j}v^LFe^T{fzpl%drocf$sL!;W41X<`}M2-J~U(k%b+XA4=ll z)Se5$b0lid6Jb=6G21m_!RI=&T}e5KSLex6!N}}!S#eq3UR_)TWp_|zcVoBZC#`2n zl8g79!g9XHQ*Cn{{res2*2aCZKlj%iRqAEav4bD0l28qZnQbA?>B^`Dv`t8M;JR;WrjM`*3WNpS!1~0la7QR)|8ddL@zE=b4H*CWn!7`K&+7RPQH1N zDmEhsJCG}WJT1c%ju|FI!mOh?9macBWOml~5TeOt1a|&wNFjgILv&op^4 zLYq=79zK29%|B?Y4mKdQidD2vG23=J4nI1YJGmj1S~(IjNI7NC6xnhkIXO&zzLi+q z1dtR(ZN16qQuQYKnwJ*{CAQ3x861|(r<3#@g+do}uPloEw*+*mpJnUp zAMmZTw3QxcW}RK3gLy3UAf0P502UqZmtvdeC{Pqgf8Wbgj~>r^cqQY!Bo*FEZ^qttI> z9&zV0Aw2&8dlbQJp*L>Zx2q}XSLL>hN&AFB&q67Z*~2K5EXf;2Ka4O8eJg93*n&fM z3Pcq7k-rNdeAA-0K45%>m#aGQQNmTEa5#-VRRC~JSu8xa+s7S64TXjuDsPeGS7FKC zr{$^DLq>rkD-!|&kl1({X{3So!LZ1Pa;%?{Ec zrse6__M+ZsA&3mOqAN&_ zC1jCfC!+Fd;e{=Jbih&kM-?)C4K38>@zb1QpSvTE&WnWGCe;}Ox$jE?ZUtC#TbU#~ zbgdI2Ok<4l$_HAD2)l=ymLAnCU`M#hO8N|Dm3GXhE7P3!r*X74(kC)xOwqTn>rW66 z+oT|$dX1%xR~r~}!xM^pZqFMP>-;48&`Tl(R^(y&bvt@ilr#B+4aBFZIW?LVd9yT) zxQ?N?{DoVB?bnNfc4qYEmN^WE#BQTvNhlQFV1S!%T5?1kgmoXxA&&UcLQPm%M=DM zc9GSpU-UVVQW;9^`^uw+{OWG*G>VTcxOMYnDo3CsN096LM%o!K_Zi!vI@8@AMFgrJcK;B zITa+fQOt7#h=;6;kbePH%quo$`NiMnjAV8TiqX zBa6*a3Fbg>D|X7?>{B>~D5pkUyXFir`5Ku^p;2ATT|zIl-W#hKCAfFzVljMuJN^}p zRm_C8^=$mX^Tl+QmT>8&>Q#9jM*jd;Bo0s}eQPo7qmOKp+}xvov20jp*Ey{tG|7_G zyKb(JZ!}XlF{wERfO}PGA^S<3v5{mgmXJmO{0&|PNbSrYX`bO^9X5q4?@}9=v9$=1 z#v^DPDrfYi-D8&f8GO*|qKg*>xub~-JYa7H4 zSNEg?_zEqgH#1@@b8M}gta1_!Cv~Wj2yN0hBzBi{_`)JK(x>ZJl_i!K%ESVw=2LO= z1N`b3DW$Z|HHQB`h!(6=&}&J-RGOyqjh-sM=kZ*8*bBwVtV+~!FM{aUO` zFtnORllZ+x;rL>++G*Sc3Ym`^iXXjBV633ckh#j|rAv;tW4UG#9BlF#RLFTmQODMu zJc0bM&$KWgo%=xg)kKmNl0|uOj#s#9V|jNsCO5b96CWupftopbh0P_|EFWUHWx81T zR2VybK5Amh@y5`v*;&cuWsDpTP-+#q)9s>@?D~47%G+Jxkg?Cc?xM5O=I!qHEy43w zp*Z}pilr7yk#jE{&zo@jk&|Wsz{j_xQcHN!ZGogA&J;WD0R3vgmF0OLm&+kZ{v2Vw zb5)WLv_Rff)NJZ^DubStqT5m)acFWM0O**;VM0N1SXAU6SI`4pdsUeY<<#J9sXQfubP;kfTPDw*1Sn@ckvJrH;IXjrC_M%qv71p+?u0UUEo5g26ZMtd3< zM4ky3?Ex5IoB@{2U2Aycx}VBywXA#Hl>O!O$9m0Zgps-thR#)mShPuDW+A|Bgcozw zeJUi2iqwi<^|30rWOd#5a8{B<7VHYPK^f<@PR^stc+pr4j5)M@SlqQs^y=>r}B$mb%I?+O@f zT<}Od)N)Sl2VqexgtjtB6j&~4#Tv(nGmt-ooNl5?QKVEU9Bj>uFG_}0ZzV@RF!@bL zTo?urZ$;XtZcjaFSa-gJmUmXsUKr*(R$Qol!1|hznH`m(m5Q+BZT|o~)h4n@%%!2j zNSsO2k?m8%16uAVP8E6{r$5%FOR&-%H^}E>HM&I+!!OOq z^Itt?Ki7iJN@?ha}*EBvU0 z_2(Vx)x@FG<;L^LAaXvG>}ZwIPJcGiR0wwhK4Pb}R{qhP%KdU%Dt6>IA9U5b*7Ffq zbOWgCSytr7k-f;?8yVy4^{rAyZ$ivB^BJNH+1)6}#xcmp>sxoRq#A|jC<8VM41X4D zpSV*SB+HPSvb>Ds5NoK_Exy&_Xy8UDE*%Dc*&X6e z^pXD1jS?qjQVOX>@ z=t|1%BfGN_6mhhARa=N6(k#jjI_qF+*)5DOU>JHgSrstje8B1?1K3N+~X;dB_nXGWlfwwD@g2n9lE?B;>H; z2eoRYpp2!CMUWks9P(>o7u{@wsm3sQ!1SmkQ5IY&eg^UutXS-LnXt>(f;t*fVq1_j z7Z0hXz!oJ?Pi&6jnzo<4XPL_|ZSVihG&X7=a|5WLImX>1(0H9$k++E^tTPt2VZF+BK@}i9XW*05cq9=lm)pV|6QF z!Lfs)CgJ$?L|J?2fDk8+Aw#@ z!A2+pr|{;PX1XoTtEfUGKnXy{u>SxPT)=(#d}BQ0)7rZ470Q+xJ)V;Rz+el1y1(LU 
zkOLj)0mqt7D8(f(9nzDUQPPS)08-XXL;*=cF18GMhlf4uSQ$Xv;q@*U8hn!ab0E6%T&8!<{ zKPA%b{{VpwmCpnJ0P3elE5S0&9}gP_U#O%8mx!!?v$dH}E+Uurk^VhL=UKz+U7v}d z8n1`#wOE%vUL+)cb-wWY4RKxQ0okO_J!s;aoKP|6p7fNRX!fK6W|uV6NtytV8hALU zL8q%wGGKF4Tii=&aFay)T~BWHTz02}#UMvXe{CMOW;F{j{*atH4*u28DaosPcA)xo z(3N>Z!>6dN4Qjy}A*xy7qLb#3CvSQ~GCNY#X)jJcI#Se`M1qrVIn6XRF6uz6XxQKV znE7mXMBPWHxUF@Wt(j+l?v1+RqYCBf<(>Omu~*ARdaXpGa-%C9YEKp1d(iaj?NG20 z{VOdCLc(ii%x9miTKjx56&*PFy;Ra#XvWp`MV~cE;H@J_0LwD3ZcSzCx^TKtBzY2n z$Dyurd3=J=Ns#N*cd3+`8j5FIGuyI}=ff73YNTSFfn; zPOzE?H7k37i-?pU)L+jl*^^O?xHO zqiab?*veajjz>Szu3HK_oY#cd#jjq+$IWlM@?rip)x5cEbRDXh@cpihaTL(*GRTB9 zg!KoUVx`V_B9wL;Sz0rnpvG7;9B(r;(-)F$2GLX3ILyBP;G;kLd z(tzr*l#Gmgqa@YsTM~GB29x)4gdI8OJ*#6-vqxrhSi#%78@ivRX5TrFPJ?hJL7LB0 z%Cfc5%c25PddSFr`(w9!3g{mDM*AVR)3nC363L0=mG{UXUdx`0-qn#4JKQikQ3B`Z zsHG)~`6;-LNUCvC+{sIn>7%>0zmooICoxIn6^aP~BW~PnE$TgMpVBWH%U83X$zmaq zU4SEz+*J=gWMUDSaxih}=}8U3M9f~@X5au=oUgd|q^a2!>fXhYW`&qME)Mg_`@ey! z8g;$SrK(1@z~V^Xcg>HJ9P`??b*tMX(^lU~vyv%gj7kqtAa1I_lgX*)5_o@24v87K z+7#_wyAOKF%R-jnd-e-Zk|n#rE#yx!Ojv&GAmIZ400})SCrg+mgf37%*DbY|Ad`-@ zl(%vTT4_{Cc4bKxW6s^ZMmk{EUlRCzZbj4&b$4cG6HOG2^X@-+K=h{X1X5~S82XIU zq$@P4;yIc44X|xKoM#7`p#)Yp3n`N1LW(~0xxhZeb*`1Qg7Z+HGZtNRI=C0OXAQX-02r6w(`a;H^n)(oLVcZ{9|B z9#fYZo)t;q7r8#w zr>$rW7$I%$@2$3#nS{HLygj8_51}{zl%=PbcfXEMUvq~x~#8|6fP9U=xaIk2fnyj zV764Vw~>-1hjtX6z+?H+RUDCAwkFmzT|VaWX>N<6Hr=u=0c;LO*0jXt=Scv6sv{^c z461cNVc|#7AadR8)PafXXV-l=bBTM)r6IXT58(A zhT6wSOK2yuYn8Zbl?O5_XNJ#ym3gi0HJkOf`zo~lXm2iQKJyNT+izN-7PYCzad)HY zRw*pjfnirqx`%f%XWem2CEG{hEhr;0Pc`C0y#OGBMsuHPm{YscN8G7PO?EoFtqEFt z%cZ-8rcAuBkZ?Y4Us|ted2I%tZU!>hEd4UXo-3bQtBZ??1g>7_%FOQEZ(Q+@@Tnlv zrj7|El4RP4c02Qv{VTQMy~|aRglEkE0CtO47V@>_v#Mkfg0VMq*j9eGn6$YiduMpd z0Ctnf-N4A`Yg@@{d7}GG*;I|Pv6za5Ip?1BQ&NcC+P9Qtjmw4~X9VN}k;iYXV=A7{ zM2%c8zUNKgJu?39OTV~HL+mmcb2=a2ZBPeN56awh_N(49)FrpIHiqUKv{0U4NXp?! z7&*z!aTXfXx<8wBrcVu=K|r~-UD%XlE`Qo{lUFV^crGmM;#gvXb0G3lECQCv%O0dI zD=ufbH`h^q*uT5Cjt(=7&E}HNlbXtp?C|2PAYp)Yr7Nu+t1T6HEKqH~A=elm7s%Sm#>4 z(`1oSHZVIY$a^13*SAG0CFR<+#IqC&x3msV*B-YpondhGYNhSC^Kg}gjq ze4zUpv2ke}Ooc>l>)n$o)pqAgwV0um$yeo*c;x>8we3axUPLm;ilur3obCEmx{INb zR~Z_`9C1hH#S}Otf*$AF>si{Cm`{{i@d~`=#tBjUmvYlwu;PJ zOT7N^ReZPT3uNSD)}K9vDthR0v%}@x#2b;7v-u9;}D@zG*K36Q`Z&BD)xm_6YJ;seiHn7Bu6p%sUpwzGCNKU}92jDp8 z1HD_4*xZR^XuPu_?bGn8*BWq+$f|N%*SV>TmCI0*B9=r;VJb*Oo>pUT?+%%&P|8La zKbAqjV^^*06>S;|l+R2`2P(dTt!Y{%wZ+sC&myErA!xl^`k!H1NyQMBq&7BeCuo^u zSp3C*J~roR`^)&#HOpH`b(vnoVG|xek=u9S*jCfp&bo_94B|Ug$lKmeda0~I6!xhO z!a3cPY-Ib?Mt5kgZiPz*zCr$x3+>u*x=qE7d*+)cxGWLV%0J!oqfQMJdnUNvS0o|o z5BOD$iRO8&$sV*$*CSFnBHL;tblT$pVDZNps$0;>vRM{9sbKBWwB_A2dBU&vMP*(u zny(0pbCu3Wse|qkX$_5}7ZG_x%e0NEy>sndR4{6G*6k1aI7vQtl0_`)Pi)o=oWjv< zppi!GVM?C$tA6VinKs!ZjJDrXgWS~BS9%p0BEtynNcm$7!{;bZHC?BX_p`^Jl!POq z)tgN@E;Qtm?TL(TExV9E8irY|V7-dfL-|2}c_D1>EO^J#vPip_jLt5zxJePmPPz20 z%X?ELl#$HwqrM6vjIbW{74^-W3k)%xlE>C>?+R_3Y$wY}n|I7*G7rn8T$5rr;L zEm@-$Z9LmDGocQ{qjdw`t+X*iXkH0z#JD9>H3av`av#i!L~Yz1+p<5UX57XS7JcO7 z(yd*J)*^=b;uihsl0|L^RR;&!u3N`(2oi{zLdWj098>HjwEH?VqCM%z2BqXUh;wBAzR=ix_F-mfW*ZNYa8S+l|ES*|3WXJ51` z#ySqBl45ADb1$55hvgkuQv`xLcA65YwBzOgk}5`B-zs2d``mstehntV<`h|*bDkH; zlNmKvT}iVmX{$D(T4qOJr{!fIbbHo>mXVqFH1{1o=KKn4$nhk0rssT+%2qRsvX6Xq ztv@zqKxTMf>j+YR8peCw8YWX=W^K0MmXCmb@f67)b`|F=I}B7e*3cOl;1Y$-a2usj zS?)tW$QnOF*dysgV{x{Nay)y6!y7pM9P{~9vp^-WS#BlRRObL-AI`Gl>mftQIqAqA zPiP9^XRkt)=oPSO!l3hrn%4lF>#{f#C zgZWj7LMN7y!vJl+IEZ0$>?=fsW4kPpmH?t2o$BV>SOTg|=Ds!BV{>cUX<9c=9b`jF(^gj5)9_2SmqNu#VLrlP!}p|8!$85%6ZEonB~vR zTao!xQr@9)ZlPp+{LPF3^s5rtTzTGHfCVR>KJcs2Mk9(y-7W3o#sdgTzyAPQuIT|! 
z7BMhOnOqj&VK-Ek))7Y`BGIm;X8BdVP^0NlY#(exXC&^+)>n)LT9Wcx$6c__6tU-K zK|hT(eStd`wIrNvNTx9b9LBiDY9>kNRcQ#ol2* z3P}Q+jkLK}Y=#ftw(yz9IK?_SNxR&!l2_k~l_X16b{6G&?a9p$-XSasCqJ00)OHeB zo6Pf0Y#_BtgBe&xVQ~8&bU81hT#4JwFRp|9Fi7t5qAjP_-f6)Y*LPdF4ng4Y|3XD0MjJD z0m>`<>NA1Xn`!1qq1!0ZHdO;;3b2AWks~>);bn z-$)uLN=f`jD~-U9dbfCO<(4n9J2H>H?}ZKY6<%*L9*Y|W`BCrZrFiJ;@YGh2$Hsp-(5FH3+a00Xmo5bHF=kC0VwglsG4neqF2errJ*> z#Kt4{OCE86D#9Cw+?J7UkQ@R^;Qn;%&|c&=a1@o1?aKrANq#>MTDcO=B)Nq-TXF;W zN~H2V{VFMq)uhBs=1C9&p&T~fZYsQ2-(a1t5zFocS!Ec>_ouj>kg&{>yP>{$Ko|^czMfxLZ-M+!T*<%OtF$tM>X- zw(@zAwi^xCBBOU346d2aQlpbiRNa7q!0S@Vxak&YWclWZ4w;Z2=||Y^RZ+7R9k4%{gXqk1yrX}AV`euvjCPwgljkYb=&I5OFD;VI+|}EYaUHuzvO5gpYREoca68pV+szxK2P%34-l=Rw!H^Baj9^t<1~?u1 z`_;)A-7qnnVxdGnU|v2`)Cy<-DMeNdf;)OrOc`7dRh04w>-f~tA25;8vGk~fY}f!( z6@?On&l``Ssm!7BbM>m>GDB3!8Ig9D=kIz{#z))okP-2f?UCN2AwvS3niCkJ-+Gk}H)Ewo5;`Etll(ZV zjGLieIj9jBleZ_=G+cHp`MbhzPm~TlsnW8>%Z>OvACS~+jEaOY9^SQbJE9|GVaW$+ z#Y?)_s|*(n5`u6#V?1P43x|8DvRERVe1PNwSKY+QNx&zJcKTHYghW6CNkrDdSe4V)uE;zHpWQZ z$WYs`I{p<+1bZ3cPu~d`<%=^P#c!Q@O8e6;C1AT3X;xg|jOL=ja$N6)Jl7#3 z8C3*epL!#`FTZ-654zomstm-UC_kNDfJ8jDP|Ql<)rjlsQq_c+BzFWkb1c zgdFuf>pJWA5WLFBQ2=Bd9JYF6=~JV#lvO$N6cJL@$8erV&}Ku9dWFsfQD+*6DpE#? ze)Nas1&6Iqe91Hh-M@979fx{qZH6RQ81nMK;;kzoG1`1_TthNAK4LZ(YiExC07F)7 z?;T}pcNY>$RJQ%RZT3CtFSVH1Iw}%=@$XdUx_B;HYjGw3;ZN5;l`dA<7qM-|A)K-m zNQUBlD(o?dbhdUy3>RvgbBf20T#!L}WQ%lDBOyW2cXU67YtXzIVFllUZ00a4NLo4DlriR?p6YQn$EuJ5h<_zsf}dN2F`u%i+C14jb(I z`TB!iVv+%c-Sd0bZGYlxE6p-^*LZoYS}ml17Z|QC7#GV@7O@r2udO5nOIvyGCfN$J5^zB6UEY_b$D-SuuzQB@=})t; zw$Wh=84q#%N79&NH!@&z>T6XVtX2ZAb1^)&d(vIpt*mAUapoVq-`)QJ>(boa3t0T9 zux33^_iHaK#OIUmK|@OCtvek_G@u$nX@Ga{NN5y{agj_yDzvgHAS~^|k5gSp5qM)s zKXvb?ALsBD29<9Xk!p22#l*XF5;s-PPsG+0<>c4*O*E$@^dH?GlmP)RBbSl4I~~kV zQfYYUN(ZGKDS(ELtvj_gfDm;wVj8n0S0H6f#T}Og+{{TAn zTjv-^epEm0AMh2-V3YEUp2EA!q5l8~?}L^VU@j#AIR5i}bvr94z?oYa)SaNtvc$n~ZG z4M+zb)huY}>;^n5Kl_r?d@;?H*!7TO{`sd`5BQ{|_44G$=S}dy{{Td}$6qvmD#e-{ z;LlF9hk;JU2>|qC{OP8)Mr@3pDG2waG=Z~&R3V4(%`Q7odeAdSV0u$UIjNl}0o2-B zY2uiL2L_$inNA1NfF%=6iY5;0L*`e$Nq~vGrEWS>8LO>ne_)@^i<@aVQ~lp+&Qr!~b4I?3@&l&c;gyCFiTyL{ zSvrl3x^>Zo`9sP-_k|!tS>k})bg0zib589@WA|o^=9(Ie&;{E&(`@@&nA@nW8GteQ zkgy>0n!!DJHEPc7plz*}iMo&Qsc59mn%?FssMSHi=~Ojs8E+RXfn@_I_@*HWR(M;C z?JTv(%8y6qlLTW!I;A9A8&VRUTw3h02QRI1tJxYqF zHQm%px>s}`?Y1g8R^!~}4W#e$7-a)EP9iauHx4E)=PCs?xPZ~1UvIs z`qWx${P5mHvmg0pr*LoZp{~LorRz&|`-_vdZ{a`Q6}_hbbqK~YoxuM9-zL24EhK?d z?(`zMKMG#VHR~8I9ZE?1LB~6~a%kr0!<6dvv>#`M@#xw9RgN1c2lK9vU5R7U<6q%Z zwz=j`de)KHmNcy~l%AiJNt_C4>BTTA$N^#s4WwY?b*zi{O!7r-665AOYoTDQFhH!r zplcAZ18(*i_4#Qr#n|TW^m~ZVx}Xi5n#m~eE`Q~wa#yhx=@O#4Fdz}=d)GgIqO+^W zsGyZ97;}nS39e(2bVhA~_V+w>`c-+NRUj*OtJiiHF+`>$ZcohE=~U!(-Ho3=p!BOu z$;F}26hM5|lb`OK5l?MGR8o;UsvZb$n3{!*jT=!cN8X1 zB_D$^QdM~Gis>|+8skpV?hVYbG6}MLuucJ9{Bc~E6XlXMY=OISbr>CMw$OZ0cKQ=U z@kQlDm^Qf1{IAT-?sHhx=KChp`F>^=tFPZ&+Z`?m)B7~d=!DH%47 zaoo@(!^w<+o+xR{k-TpHb);gFCR1&f?XD(@MwZigmt!VIbMp)ymD%Z5n!vPJZe^C* zDK>uN19Y24RTv-P#!YcHSHIbglSAh$j1MayA1VAQ{963eYA>mq+qSS}E2wn=ekcG`4?J8w75dp7x2Gv8;Z&2x`5@xm?q=~6uB$_J0FcACf7 zPPcEOENCvRmN%J^d6AfX-a4OZ=QRyh8w<9vp4TcOCMhwtNb*hyZuOlu_OYbp)ZOsY z#+nooTq~G&yjWwGe06WfIISE#lu=zJv~ejzvKQdSxjRT5zLmvj8t0L%KnuHWCuZ(I zYUv|mbEzbklZT33u>>RTDEId?PMc2UMxth1`i=bB+E`68{{W;q%^+sxM(y|yO0W^X z;WE%sz{1|E5$S+W^rq`l>C^b;)*U?=Cb#nh{{Sx;1A-5JD2`p;HqwT3x6YLR0Kj8v z=%KBdokzI3bg`?Mu77xpI0cBo&VMS}i&3)jc7$J(61oh4PfkBN<)Fh`$G7{S{*`Xs zl`$?!VUR_7lcO0cvyK&|2kx6P>QLRh25H_!iz8!y5ji`)pLBi{$JOGAHout`VpLxz z=G;N#np=GWXPVvTWLP9tViB_A1CP$4x`yU!C63z6t46!!mEzoyh2-bl*Pn^@NolFm zPufY{lv>7HvuyVE(yC1wq!38l%H@b$A3@aCmam}y0ApDdxfs+u#S0S2yK3#p&!^pE 
zU5|!st`oyrW$mq?xsu)>TX5&*3{PRJ72;zA=*Mj#mMh(TB?+`cwftt3u6jG8`p*Icu-kH8+3;+sZJEeI}MLwTP&XXS5NOPTIv zw3%jhHi$uR0N{OTuXS#aMttarxC+o<@y}8I^+j!dKGgQJNL3MB1W3yBoOZ1E?B$bDA82NRHI1Jky5M!KGYtBa(#~YrwnF948;7y&PKG9m zP%eDMNn$%=bmfQC_NnKxG`)xbk9m(fkOtewEZw=PEoFHmCP5oSvA+@yjNjcgvAC)> z0Ax}WvoHF*`cu5yf1P)hm0W;(FykbCbupVme61MKi-F`BUoE({IssOZ%gCq{6;{g} zXFrW3;mF;LsGMh;}NGvNoAfcMGhDVH(AuMMk9=1n64 zzFFztnx?W$(Hwwx3~hR9!8~lNCQmc?KyIG2a)M=n6fWD=L|hCKPd}AQM|)a!Rb0(oVfgFK?)!4YIk48TZX~a2Mvw7C1jaTM$of*6}JG zkvH98&lQn(qg+m(Bn<7?0m_<=&c^=2GPffUXQ$rUeepwzw&a}w)Gd(XOs=UDCh<_6$CcBWZu)G0CVyT2K?(qrBqGMK#v_OCX0(I$!=gO$L0spqG1_x zm{j-LPIFDYcHIw>FqqxNOl@RneCEhrnH1CRS|}u_w;PZMuzD)-R>P#bnfY&S)mq}% z95Y9i2b|Q_aV(L@%B6h6({2S_)qyLIV1^e?a98gG_f1!|U$Mqrlf`xXf&_NuEPKGHUxXvHkZA{=;Mnqo+QZWf_t{*B333gNhakN93ivZ!X`Q z;~`tqHH{vn`g+paDcQGyiqV$l+(uBhZ@Qze zwNa4UM#Jo>Drc_d-Osf(CLIe_k+TNla>R@Z z(}o{1KQJf(yPoxwotQ03j>`;ckgRI>!EZ{feKe4vg$l@7IbhulO*~5S6ooeseqvjJ zQA=eka5s|n%-z$Y{u!i@<+X}d^UM)^*O8Su{0U8Ir2vPe`nECI(ARc<`iaFG_{3e}x&14(dZayE8c zb^I$`Kq#xVV{&+@lzByQv0F?Ndm!AsA0Qe>nk!}rBB@u;Q_gB+@>#%qlb#6eQEE^q zn0be2BPvMEP4(2Zp+O4}QJl6r`qQDchUQJETY2sOru1P^ef@@_6ISX)M6 zjC5rg$K#5ai^bECcX`M7i6xmuItj$?iS}o>h9UB+vnsc_#Yoo$E|rn?qOV2uq&1-! zytj@PNLzgP!Q7__O^O)TP1`I%iGAqoIv;9$+KjS-r(i?S1=~?F%_#vS+&idVl+s8| zmC*!?C!S=JyEZyk=@Q>I(U-v^kFewZ~7-zP(lQ(D-`yOVvWZP-se zj(MdQJKH2G5yFr<)NKqh?Jp9@SJt4CREu&)Z;^wA8O1cN1(mhAyxrxg2)v!E#X}Oo zYZyZyFP>W%$*Q)~A++;6*#m|-9PnxJyq68SC103qaO?8@YWZJKx+juY`tRw@PV(X) z&DKCj?#=Wc)~d>)-rRXnd8$r4;rCSfRbz7TG20pa=T>fdQ&0|JW|C%+U&>{LSRXHT z0s7UM7~Mwmv_(9jmcihUYM~^)YX0=+9VyoKvd*UBd#J4=2M9M3?f&YHYMLVK(?@TnQdX;YYG&aMqJ~{arbLh(@?vGlgdSnkIKVfW*-L~ zw+=hiDQjUkjIq5%f0zv8r$9|PeA%RTxK(eHAp?BJ)3sDE#|_*{W001|-r!Tk(Pf(6 zJLiT~Vi!17ap}69mqA>#vK0)j;avt9e)TnFaLE4vJAB`CJ_bE%y^PHyjT=N-Lzd59 z#MGBpLO&){jAR0>+ymODt%h59(WI0n`4A!6LF08-HxZZG`<2HT2Z|!Rcef$V>~zTH zk>Gc&!n&lornAkDS6m+W*K`Yz1 zGe(2~@{|0}N_?%8%ve~AFzy!$Qa_mIs*h@`JhFN0t0VbshQkaK@1FG%ea*X-heA5i zzC64?%&IV>&{I*c?m|MefR_1tWb;f8z&S?ab~vd-C{zU@Po+6+Sy71_PI`mZtd@or z+(R(O(vW@fRFO@bynV8xxS<@TNWtTSpSw=LmZgcAhwmmBZK|ldd0~pK&A4FX3Xux2 z-1`8{Kr+9NxTwOD8CJ+1)P}(nqhVV#7GT3CZ6a{k$<0kAxbjj^$0x6#rEw$%Odnc2 z(0MeHK+6}H1NnKta=d5LH3}Si(*oNNSJTr5o|9q;ZO%%*4m}9(P@wY$af*J-u;-Eb z(i%bLFjL%g6stGP3&n*Vw5T0pBN<%OjQf#O%Vj62sU3&MGK>sv9)#1OmuWa(o4Bfq zHf7nd?^h#0Re0RJNX`umYUAEjW>n-c=qPtsWI1BrQBF58yGOLJes>{WKRTx&6I`;l z%DKyDwMlH8%oG;L8TP14mTl=F0gf1f`qZgz%$sq+`=nHM?CTqzHY;sxjEV%E%WI!jyl$nlGqsy!?d>1gUcrw?nO_f z?rqVpEwGBN7jQY^uV~6r{{T4QLopxl6IwllwPtw2?j3fuUXDoGEDx26p1kMUsv?gu z*B$EIX9{EBXAMMvjus;2*^V>8rraxS2XNhzxDC!JrAM2dc{J!@MrH>&Cp=OD30YA> zu6?uGqTcF0@y2uMS~EumW+1kIfF6RWCc`0&m0~!^=xMD5MZ2bR&%G;>UDGUMWx*cQT+QlIxtaxo{SrnS z_;{oQk@P(cdgsEnX}k}&pWI!_hv>i>^0RFmk+a4=P`Tg}UXSqBVPm4|p@SHrk!CnQ z-cmoMUDky~Gm`PGx7j>pKOZd7M=u`Z2l-YrG86YOJ6CV<3ei8dEUuT2HrYmc;~zD1 zk7y=$EUHQ540QTYIa!dAmaDk%GCk_ZiHtkS@I3}<;#jZv@u2}7t(E4XbR&TqW_s=Z zwdfYskm&Yn8bue{3jEB1q>IVXZ6dXXIMQr#BhEA4sAfy4jy_|aO=}o{jK*mQ86^GV z>rS|lEMwY(Gcf7>+I-R_f(1|Yr}V7CH?d~rk2&vEaIqcEyMl3=wjM0x)1R;_*HEyOOsT(3CM+ck^Kj!*0eS4NmWLhXaYkR z!|mz*?JzSnokROZ@robZDfxZ<>pU-7c0VdyP=HKvO~pQwpyHSeG;|c~Qe!582sx*Y zDFDqs9AbfpIj0H$r(%E#K}tBK#R3!(tfhD(nwndw?i4K2vw_n+sslGP(SMalgKvZg>-&TJ>OTzO zZRhF!wSSd!nw<2Di68y;{&dq$bBUw}v$yl6lUuEfn8qVC%u|Tx(wmB65S&wTb4uQn znWiw#C?|?LQ%Lp{fOl#_Gfkz>q#<}B zj-Hg1xZ;6`X*!B^H2YEkhl)UHNT!-&7d+F{;-ef>fug`nr!?S2L=8JWl_Mksq1>c{ z*pAiI+f5IMbk?<;fpZwhZ-4JsDyJf*x4Dwmbebl^FY8DGXeG3rp_E~e;M027O>qob z<%(WgFZxV)E1x3+)~wmyJ+u0u`@)k$xvd1v18wcxRJWGUU)!=ehEg~^)}W9i zv56vV=91QL-i6)AOjK>TdOeO_{{T#DDUS8tjr?SFs$fIAju`MbuKs&?=7^+;xf`u= 
z`i7irCEmF(e|&OQqEYIGpK~?tiV!~QfE2o@>Gbudt^Jj@rKjB?m3)^Dk45s_aeKt5XMG`()yQ9>g)m3VY<&(wQY zGa5*dv^&bHe-3!4N*E^(3v6{c7_B2aXhYQQ*o@R=s*abdL43;;fX8qLAoOo>TJ0QV zH*~FTi7Jv0ZfczNH!XQ@c95}#H_1J{D?p$XyJv44{vByqK6p#ugObG3Wh0zmaA^W$ zV49eWk6LL=ntKYx3#~dk$jowm)BV$m=kEMA(U$U6K*m7_pT@f4ocdEwPL!-3r<)xg zQ4$@oGZyT4s{a6BlsWTU24CUNBE2f&-6mc}53O@ror^$Bh>0L>>^TFyPc)D?Ca#Wp zSk+<-Pu}2j*0l7lG^sFnGM=8e6+&cpj2A-9kCgffy{BXQBza@xW?~L#y~&P@#gJsa z2g|z^cSzIURMaE2i4@2|Wmj;3`0k z$u+Xi=cA6Q*dXBg9Mw1_niQTWq-06dETMmiLC7DSOk>okNo+tOl>$ht!IiSRdgi;m zJ=0RNjz?Xs{9`<;R};QayM(SA)EpdEzlLvTx4mm0RK2*;@7~@b zr{!07VV-f&3g`8k%~I<~WP<6p7T#3%9XR~!g<+>Cdqng$bz-H;&t21EW25P|_JN`M zHN0x`M2uPXujO3suBkoDlT9tdTiOz3p96Si;Qf1jE4rB6X*wK!Op)WtNFaCyI!9VAk$Gx$+m*$REj_I&BgMb#bJjaqknUJ5G*cHjfYbd7H=T9?RnBFk4l6H^D z4)3!hgWUG2mv(kfXKSX~#XF=(!bZ-~ESb&$ttGleNn;YUf-)7HbddDMO)@LXJiMzK zO3m_zaCrAo+Nn2d7*2O(0ieSaFi6O$wbVw8{Hq?Pk3m=OEUj(8xVMOJBZg+}ii|s~ zf*}zxPjH2w1e8EI#az@ckePC*W__hC*g&U;mu}QO0q^NFp4VzGctb zL61x+$Y)>t>1NHQ)`f`-9RNfU!66YMZwtH3+Y7<;u&agwwfUGgaA1=iA{3~Nk zib$>g$8oSv6K{|y+|jS2Rk*uqCApyTN)|UjLaF)Ju1B>n~J?^i5($8&!xa96^D5zz+o)}*6Jg4gBd<9IiKa<%ic}yN=e;)aAF@aP03Q)6pSrAh zH9=%ydcGePJwDJh>0nTtU&1$0V zk1G+{2Tb*>+oFaY{gSU}s1Kmq(UD_y|W z%^Sb(OK=CJS1pR>xkYz3?z?%;-SX-Xyf&$}H8^6e z)|$CaSJ0sfa`@yP52Z~aCA34zkSX_S&yD7{xG<2zJ%FfWk=pNULvYSX#tmu9a~+Ks z<$pO9n|NlY-!-?}{I^HW+%CiSs;#`Grulgzr!`{FOt@?1xR@11Z?NS-R zEF)s9I6GghN#>a3V4+=e_g|^1^GuH+aH<=i9qN;~G=wR=8|6^vxu_%G(p)dnq-SxG z9DsU_)m%%A{JE_nCAEpyj&KwP=1XXf`GX=)qTC^`SLf8`fe9~9Y zD9u=A@`+N)zf;X;*);LUsEN1{)SA=s+&FTtpHeF+=*mj_6)tW*$+A?P!3Xa7e|zzw zZ9*s6E6n2I_)rHGEzE*A*cX?P-GdLv)=j)Ve86xrkv;KHY$N^H-&bX_H1lQIm>}=HQ*)T5cn$0*1x}QS!~$44Rp3;ZUUv-e>XoohfWx zv@1n4uMz(MMSZ`-$fV=mts=C^fh(Y1epAg!YdTvimU85r1RM&1r;(;5B1H%LypvMy z#eFPTg(dO~mkg`7{aUvyY1X!}6_JIsjr>mgM$t}=;Y45QPJE^3%Gr`lLwl^7&4{+K zTSASUv1NCCaCxaovXz2Z{JX$-Dn?YGts8ZDOF4)nBaH6k)fKrVa#3;EZOJs&ca@4T zc;i(k{i+&lcS6l{mkwe+Ve;F6{k`d`AQP+0`{2D3wO7BEN3~eh_xe8U74{?2nWmwC zl0vFF+U$P$ko=?ARTzkp;HY5bS9Z{O#XDqyN`N>8ut#sG zs`1VCY&=P~%;AqxeY08e*%xtIc+33Q+Q;Qy^wkRxcSdKCk0&?__*19Oe8h=HM`Kn0 z0C_YFiz39skV^GF^>IdWWqYTV+vhUxRy{iAtH*yc7j$4a#=^M-RO@RmmnWMfZmd3K zPin6Py1R1tUb(J_rD89sZr^z#Q;^4u4r<(NZSI+#IU^xR{_k3?6G)D`18rl4+4sJ* z>6u~N+vk{W`^V;C{`aLErKX0;mVidGIf@ebAP!jjnx}I+d90TC5dpUY_^H%oz<5I<_$g=Ew7`ko7Fg>X?*raKyr0|b4Ew}Cv{f0@TPVe3$nGx>&eRA4^lFle}n@ChvCXp;;b zKyloUYO@v8AiQe3)Ft_pTe7d3Ax4uK0nP}`H2aB>7culq zf#0=D1;RHHVMBD{sjLB^b__$I`H-4na+n>*V^?dDdlz!;BC5+99oPy^DK0H;E#zb% z^Vc;^+_2rknvo}VbZwmvBooa$G$zv|faGKq=bVa@aajN#Hhn38MvO#RRQ~BXrHQt) zV#m^{G&Z`8?yMM->oqM@*j8Aa_+iKLl-L>}pw}A)hTCK+fVu zDhMHS@}2hg^_FH0%#^Ed#5R0fFdO)`yLe zxly-~hV4>h@=gFe@lHlggmpRR)~T=T9F;@Cg zwScTiE499&t;3d4j=r=MMfqJYFmQcpDr#EEbY?aCr zSfFA6+4HVi00YpPs7AqYk|+(>VDzhY3KgDC ztf);=eZ{>&tG0WP%H4!#IjzbwAo;EscPApJ{vG+{NX0{F12mht^r@Htr1Ywm;DnUz zAk*RA%H=r=&MGF8%mlVK3P|QiZidF&@_~cTJ?ccrgYG-`hN&%K!#gtW&qGdvHAZA` z#e;fct)0<<`<4oN9B^p3u2)=?ZmaATnz1XCl(1z3sH-Ud04`Ndf0ZZ-?zrnetB62#yU$_Bxzl)0XSrC}t1EbPkoE!1_Y>YLqx z`A4oesP6BU-c?Wwe|?W?v!+=|so$hbpSmrb`1`ctfRj(t?zJfu4Yp%bhLL#{*~Yf| zZMcHjK#&ekdXTNYmuhY7)k7b=!m~hYmnyrkd-pZGV`NJkB1v^8$k`nn_cbdRw8(t8 z#%6ACdwZImNfO2_qXtvZR$~JzmvfdUB-JsI9LU9;vd5{V+6_yy6o!%lA~8vf)6F5` zm<%Hn`)gS*Zj#<1mT%MC(*S80Dgw^OfIHVy3~_jGN@=FUT}V26{{VDQ1bT(cS{9m? 
zHjRnha$CQ<8szd=&~`mV0HvhGElePsEW`GyHI+!WlMKKBoKGdve;0kFV(g8C@ao04Iqa@P-eJC`U z#V_MP0PE9=c&1ac+K>+8wH#*@oMMpB1cH=PDZukU5@S5l(?Rv6CyGpH>D!M9V1LiJ z{&g3JIJdb!^w1IcRJS8nA?Oc(l8J{cqbI#X@JwdhLFb&X!_C$c4=4y?LahrDDOZENNICH6ad3aVN&9l zeP{ugqz0UNQ*nv_a5W;H-Krtdp1o-f4V2zYt(YHBa#RO)TBe16q)PR=O<)CiADeP+bRFe|Mb6I%BO>a4g z_~mmB8*0>u=1-jm$~xwd46y(Jamh89>`-7(Bdcv-a=7bV2AQsh#5Zk+ z?I&jTHJFjbzvSDI-yBmO&^GOe#m6TU)T~b_+GQkSMQIX3g7PwYt|=N9MchUdoRf-a*o$1M`hVG`+VTb+co?efYN7JH zEm~Cb6%a|5JxSmSm5e~A8>0uPrlczw&jJL&`_*qwf$guXXFtT-mmKhNDodawo!GKr zkIT2G<5Su;Qr~XXHuLuDwYdI&&W>TFsjYKiZ*>D3yZ9{Ers)AB)z1#vX|U;fm94&~ z6~f&Sx>SsbBTzu(9y-!XK93!w$*Wq#>NpO&yyNujD&C8E9;0V*IwRZKq@TMD*UIB) zKKU80Y1EW;I%5RQ9YXt5ipoc_b~3YoHYPH?GlAZjV;|U*Bd?h>Zi5bg)<{7u&$cQZ zS5!80LXcX@@k@ltzE3~J?M^K%rLc}D?rtQHbD@yrD`&UiR~yx(Yi?9pgswkXwf@jA zJgfWY*X0X=yyu~=LsW(|)%0kR=VWf|&GPOs=K%eBS3y0*vobhWW!M|a3&Hilsy9$s zwf&5@mUg!B!-q1N{&)Gf8++BmniTn26SJQ&>KYZ(YY%mCAtup((d7V{JAdreNhceygZI)zcFgn13~t@Fsi?IrNLj$FqD4s)cMH$sQd^cpS3#Fv zYFqYOt62#fm@enQ=V`8GW4@$YS{d-#eXB{5>TUMTUMip1+TwkPN0b3)S3ujbLRK4h zDBMjX7eP5Etyozdquk$go-y!(%^6t;j7&c`K0~ z_b6AP?OC(hG!d-HDGIy*KsYp8`Ry9-keOHH~or(jJjRY#O_3ZWE4FQv97M(Ydr(Owii!t zx@*GTY=MY#{xy>&mXCcS-G69(oHs?-NEKc5>sM7%s6F1#mIz~3eT7d07&fwQ!~DaZYP{Cs8-#ae*m%cceQ8U1lSZn}A3S5G zD;A1ImBq&GHWlVRRiWal-{I58OpJjG#v+38z8LbUP8o!dbe&svV=+B3Ba0N=#8 zz@je0Wm`0{wYd^pw5=;3^3py)`qsMYFciv-z!Df{u%@?`HejiM1~NS=?w?^UGRy#D z_gjy~sjKQu$4@G~z9A$#H%xRDUf$e>XJx~5+-p*Jf3vokV~neG!Kj3hGoq?9yN|lG zB8c~F05QP*DeUs6C|CaZro3*Ox1&cNqZ*B*wT)Ce$Q+)houa10T9L-kM>{)35q_sC zIQ(jBlONgGL4*?UN^w*#P+Uj^S0ZS$)JL|gTuZ;p&kDYRtt*MLWw?e2!orBtFW>pF z$MLEXJdZ0#vF>xueJPPf7H56kYRsr*2>xcp#!BL<$|6!l0V_!(p|@S#D@xrXwp9_v z`?2#K*yrg>9gD=EOSCI<#VJT)}?6im|9e8r#Ik2u07_8B?EAS&}dcP6-(4=~oRx)*eC>-f@A{(*r9?5kSRL ziO3|Qaokh9#tp#5ZlX$>xir7KV4!!YtmRad&RA|3`Fhntts@QZ+pTVC>u4cuk(Om& znVf$rQ_%_T3p77}VM2PABCE)=fXb2tNBh2{dsW+IxenVBB?tS-k}5gx;k2Bk!ZfmW zaG+KlK%i~>h5$x`kD1$+G>bSB1w2#rNfv3m*fY0i=BY&crv_;N0B0m-v`Xlcqp`!(e zlT0seMg*O3dHm`N3%9j=#~N?$+cz4_NxprBhpq<|pojz?XfDAAS59$TD5$Fq&g4zB zC_SoKC5}kEu@eqDlhUDb=z!C&%8@+$v(a6k`ViQfe73(4?)j0H`*Rimf8aPilT@2pS)6is&{1pRAeaqY1mg| zcWUdg5U$pE=0%%~COz&y|MHVrrtrCD@9}-c!CrMG&tKB#p<=X09Z2Voxdz z;97L z!;Tj!1yi=PlI}rm6Oe@DfTN{o-Rbc{LO|%N82O1gKU3*hGRpTetVFDWIpAy`lPik3 zr^lvWLd&{oCi}qyZ3;d8YL(I|+ThO{I`i^_`C_1rBeGO!;UVN1UB{>8RHLSZ-95%n zY|MO>nNiigV4Ur!ZURK>5(6uD1N>@x_#&GUN2$Rf>ExmiN7U3&C!Gq$&JsxZ2iwxR zB4;ll``%tIGZ-8Q#~J$i(pj=3@}${}s5n|K!^jbpi35gW{i-s&70!JrwS5V`q@~*E z#);*QqEK50*wHnqiZ|Fdny%KeKko$mDb5M2a!nXa6p@MPf&lz#9)+z+NekLcB-mqN zh6^S#Dc5F6Bzc(TM*I75SE4XFX6Yp%hSVrQQTd93@(XV{JiBpk8;-F)rQJ~K*I(YJg&@ zu!*G>sAB!+-+!5L?@;91vK_6f3y6fXTEiQ#f!B<#-gn??f<27H5RSw)h37k#`a6ii*M(QNz zC)Wa?woUm9BFK-B460Xg_BAS7v5@(XCheSNhAl_Eql;pq&sJdd{+jlWRe8zMn90=~bQFkC^<` zB!9c)9D~O-6p#JklUEc&5lZp9tw?0@(<)9qsgW4mob?@zNrfw(gVL#ziK30qj@tz;0?X=(yAE1ZM(6@ZuBCUtmHP)f)9F;4%u0UE9g(@ zP_8k@J9x!CSg64i>>>$OM8{(L;}tX|nOGHEpK5Vt*bXu6M(i&G2AW_@WMHT}E>Gi4 zNr=wY>N`_HZw$S_s8$k;cVzL#2NVgG3k(gXKGf3uZ!4e2kwZnmAaHrkYD8QNH|0>+ z(1g6ZcRqJ;GuodTExd+7BC{XLoT&g#dCvnUIH1KJmPkeSJt<#X50|M@bP`5cP62Jq z4O)n(gXP)}wN%V`VEcjf>MBcS1z&3pojq$usuN_(lx@ZHijcS<<9Qj~liHw3jA~TF zxHs`0)hLxpy6t?PovSMXWbn6`vIXUU&OK>2fJV-sXPRtyjebwP>S?kr+&3&h+(%5& zVT~@p=Of;oy@j(m7$*Xu-8-gywOE+?au2;eW+`qHoH5-^AhpVINJC`wtE#9Ky~O?WK~pxVAA}`W5_C?V+XxRB`!0uf)3`TA%cURX(3$c z^i7XrYU{?(Rk-iC{{W3ZY^9b1kM<6RMoLh>KX&O)4q z>&;zBBDRC&!I1PGl*CbM7sHqf8+bm~X(AMmVU(w&4a<3TisiVr>Mj6!J-y&0xZ zA${oy%_eC_(t(TGoH|l=rTl0RqMb+u9Vvhlo|L1lB>>U^)CxW56u??T#V$=WpRE8L zsd{_UX?oBC(vF#?(w+qXC^V%0bb}&;)KUS|XT2tSP&uXp>BBV9)|@E;AT))#(Tvg` 
zx_xLGI=w(`@Zlfy$p`YP-W>k`k9(i`=s%TN)xKQ?m;IC<%BuKu{{W-idpQ18heKYb zHQ~mgazE*-e>$$ltlxjgO6Yn@)m4U1TG~4qv4rND&4EfrDWugB8?)AsnnE!_JWw%n zj?{qXuN2%;{{T7w4IuQTKUz4T7<8jNQJjt`3B>{wb*TZUvrYz($zxseDsBt8v6I@h zHBS#-X?Fg6uN%jZ2~OFpW6J>%Y-mUf7%@n@}NfsI`K+P5=QV!(80(u!_5rwT$I8v~_CFnOq| zMLc4Gl7mV|Zq&i}Q^zB%A)N+^sVvBBE{5eJY8LK4yIK04oceXZ9GG8}Z|PJ=3>ZY-3#pp32M1Pw-I%JVuhr@YehgQ9}d54!IJGuT64{y%6*7MANHyIsk zLqNHUMAVw;BX%-Ju!7tUbD9&gCF*t;(je3(MLiC2?Nx{ewQ}<6cR$*xaERMgHnP8Q zn%%I}BE29gIzBV8>v+vGHj4oB!c($X@;Iq>?(I$4kG+vpA%OtsC}10mRDgx2@W7BL z9m>lY+$)RI588Z5ZM5WhuO@zgn(i5omf$bXuPzqar2-)^)4xbk;2cV{hmnR4a=4f z^5pQjBo9ifHK<1AB@7PCpGxkoETD0`!ll(t*2f~b`}+*sN9IcM@B7auV_C{A70Q~> z=9=ACVCsA3n)_eO3yg!v?OO6$#|up}Cg)_@6`SQA>MD{%o7n>H<`#Ni#PViU$sv zKnfAgI#tasWEzA(#5PItAEipMT0jg)tCD%gIW;}Kyisg)Ol7+J(Q&1y{c7KJ^1|^M zgG#DnjNrJ>Kz-}4mNK@fZE`+j2&JovWJ>d!%$h*u5aMX#BXXfyNhWo()%zzqY-Hohs^2Ix|(U((Oi;YI!8(0$5 zA1fhth9P+b^fl!^G1RBGKw(|i%|DhDr%kE`7S)pj?!ndc`l`3Fj_S3%~MGQ%ol7y4;rxAakMe- z#Rf^Q2k&kz%!=VmY1LD$M5dmoij6m+0@;XcaWt(Z$tA?7Fg=DUtZU~;+uTAW;l>Bo zKU%#g%PMeHNx>tYdsRcVzy(-!-=A6*rVN$ZITA@eRbp3Y>A>Q$t`=pyM~j6DT1MNz z6{7OV<$yj^0~i9Dle$6#aruog+7$7+wT^~TS2I@ax9=&HWAf&eZ)OfwH~F#6PxiZM zp`P7?M#?vI{vrH1tX;2j^EnH+{{R8ooOY_*Mf-3gKRaQ!V|UhqVry4_l)Q57T#lnP z1m7aQ=fU78&))T>xi<`^X#`-fE>B_WOa)n9J<>2BDam10tfacXG6-bY5bocD`Bjtv zD*ofmQoGjNx~nqwB-M*y85{2ZHLfIvZsb*Y;%kN%T`jb#Ra?dfB z3QI73%6ZL1Zg9}LT*ks|DsOIBo}5&-mr>gj5$57H!(bHy>FZ4*n=0PfrnJv!}r*)6|q#k-1=a3M)A!5~A+PlEH2v*l?TJ0^4(nsWkEF_dmH|BggzqySxyVG{uHAvbt%NjGfj7Nn@i_2ZE_p<(oBApX2C=oj`++oK~9u=N#g*&5k{$1d%sD$UlV-65o|K`d_t zo05*@eo_=*)evrO_7;;WqZw|su^rpZJ2cJ*UR?F4VA#>hEn5KboK;2KPjcE2SOw+1O;`)LK-sxZwkn(~=3$R5(ND@) zWc@`?ajHjb%ks33>5K{;*fd+{oT0#|kqF&_#1o!QaZ{$DZ8S@<_m<@CUKiSl?lv$g zsu7pw{wjr`NoY}x<@-zJzE9y5dN7hJtFX!Ao|ME9ZcHu<@-x)cElNpYJ2ogEc}OH0 zt6xE0!xr*eM4s3frG`#E?lJVN$()mf0-PjgWAhX1nqwTCY&*LVTf#Oaw#8UqjmQrj zMKFX@&f!mu*>Vc*$El&nXqmc^TDM~)1y7Z`HFrvud1es#yCPqiu!E8Hs4Ub=8!K^v zpRFO?6B36R7&Q-F4P|>KZcBX~xMG00#sK%M303D~ET=6>mnJDxvTk1d)Y7X@BgljR z-Lp`k%0}a@wzMZ0Mjyc^Mjq8&;u#@^(Vjg8Nf-h|UAaB#)bQ_%cLN8%N|x-(NUn-4 z_CDONayhHfPiuBePXH~{H%iZr(m5^_XSo*fxA7{M2lB0>;fpwsNe$~uAQ>Bo$K^@2 z)T*6=DXNYG6MQj=VkI$z!gL;PFQnOsoZwABht3Dg>lIm+%Hp~;TY>sh~C<8 z!|$ohPFSmfAIe9i8+Jxlcg#;-w9^x0Qjs*wH;3tIjnD~ ztJtk$a}<7N=)paTrsCBBhDOZMLvrZEg#thDsOQ3OMgb&K$Q`PC9YRZMa#TC3{nB&# zR-0@|8M}-s$8uybpe0Z)(p%n^E2fkx1UNtQ(YPOtOL-#PB8FfOUIsxvhN;5C%pjka zwni!RCA%^6TCUCBWQUyLYQ>bV1-;Bd9pY5m*O66Zn6rGR3J-c$kjMy50qQEOE0dU| zX#Aa?TLJf_@ARff9$3;RVe)Xb9CD4j4(4I%Y3<14G|@LAiXur2n=$^@b4wwPKe#xl zcK~s>x8qgT7oK-a5(j)$TuhQF-J=AE%KbC>(s?S(S`|=z2Nda)#-U?G+us#kUEyEv zhB(|tIi{GJ9Mg+3R&6G!CBwLcS6+OjIn3+OvGT(G_TL|2L5I=ZR z467_^0wH1Co@&lb)UrV{{IMxu4_c@dU{`c1`500$ky;{dXhkY5s*K^c1uKl_=qgL6 zXl=}~I!FfxXHoc8YTP-8$_t4{`nTv;@u=fg5*B0gsp+0*NOvY!3fc+dFClJ1qiy1t z2$5MzupYFnaU;){RL^6MN8?etLz5?#8cg%^ur*{cf*~7W(6IFBQ#3nKBtni)-uZbH zyHLcg>PCGVrA{J^j~q8#W|hTaq=e11Oi(xXkpQUN`Av_#p4D^g=p5w>;1)PI#YC|y zOcHIVNcl+}DcmeY6nobg#W4hq-@8sn@pi2O1alP(tP1SmaZ%gELzQsDk(J6mDcFbzcnAjn^yyyC1o7|em#{D%cW>rlo?WD~XO-cCS!PQ7n)))CKw|=hqb+TZ~M_!8z&&IH>NJ?f|wIH3htktAYt$ zO-+S!MlzmCCR@z&M&aLWESto?yGAO#Nvg5PA8X~8+NXP#wYLB&Z}T zH|{-2ITaJ#sZqC=6p`$c%wS|=w{vgx&menLamJCKHRCN7;5TQC`c5}=ljjq-j#t3$-?oIQDSah3l2QB9@xz^ zY}{0@%jjyl#H8?>aJ-P*H6lyp<>o~JN3L=7rC_;vk`W%&TxGLSw#MR4i;UF;zYX#M zoSq9)ZX|4F1defxD5jz-oi>+zvFCz+cqAKo@toD)6-bjMjH`kPbD zpbwb$^{VYNZ8;nR^J1pN+_A8%yGxwy-HtFn8l`i3Hw}%)u6i0tZvrn8e4~tmS2S%3 zZBplI$h!w5u+A#fLgl{?X*T+W!+otL0sdgal{MB~Ud3Suw6h8b`B?f?f4S2v{>yAs zupcy_VzQz}nSfrI{Oh7IvI0@&PQ)lDKb>Ag7Lb5g89h4jPEj3{K;RiqK~$OD*t;HR z8qk=@xR&*#-_1jwC?^zHcO+*O8DUIgnrSou;&DtZO=I4lYi}*(%0UX{+t;8Trhyi; 
zoi+6<&8H$2=127Q`qrkus-?6ePO|}aj5a@nRQK0Wcza4)YhRjX{v-TH`^LEzEbLjB zvlEg}b4&yorOgM9^quK|w9?cTruLv$1Jaw?jL;1L1DbcWAf=#Uj8ndp(bk=s1TPe3 zo8FWfVQ|emG?}aV7Mi-QtrffELOyv&>J(4~A;_aO(s;JhPSWGGwzG|rb|s%JpO?4! z_O57j%>XWGzgk*K1TK9A6lRQ2F==;Gd(!%Kpa%|=Pz@bLAP$tH6yCJqflLPqaHjUB z3ILEAYY*0(DLS&_sbD`^m$1>*Yh;ZYLH_9dYL|y{V7dPQd}s2jn(4-sFT1bvs=gfm z09AGV^8WxT!F(pKb0Yr$jZW+Sy0um(inFK;YIj5anpI=;t)sD#&_oh=7#}0y&b*8U0;CP^8`qx3= z`~Lv5YE5XzApw#3`&DRztdxyB#ahxpb2>0SP( z3hUk+^C9J(1hJ9Xhi~R9&(!CU$JT)d=Sxe)0C7Oa3JnTq?MP!treofuJkv(Y(_5XA0#N>ypoB;_DbJ;5e3QqO9FEl3rk2t2d5^d? zHnCflH61lC;=^GkQSv|d`&TDC5?V~jFvPKqr4Vf!aog6rxZNj=G7xi7PiGaz-#R3H zoeAwkqNEA5aJLXKNJugQskE0l{HcJDzFbmnk2&X)UCQa2TQ*+K9p|oBH6YXU_|M5_ z68#WVdolAPh1ygB!6%{r018M%k^J#V^ZA1Zem2*8_FEDM?jz%;$~dJETCe=DkN)`- zdoXiGMv;9b#AZ8qW~9RQqCFY()dvX-P%|ZT)LRiFW8zDmlo_F9X`FDycX$ zJ-F>ml1mtT*;tNBA#Q1P7=&|29^7qH{OZ<^Xi1h{eo_TizHo0Gx6-GnS1D`|Q zx^jpXXySaFs4jwzvfKPz^ra z5M5iIN{sQIGg`A)n|amB0D(4jB7b9A}Y|Mmkh2HxrGvDe4Ql^4&QBrBK-P zt-TQ~bn6=*HtWk%e=Pq1yj{jks)KL0UbTa%Ik#lFe2CTaxb&y0i@PBX=?tUhjr`02 zr>`HCQ<7SfE8jz{@dmM?Urj8~T%v_S&Z;+7ZN2@BP>HPX9_mNi6mdfz%*Q+*K~!SW zEa6`&OPom}4a%Sc&st;`150T|fN0&Ypp^N)40o)# zv49MWdhP?M>sGuL@i)Y)JH?13N{zB0a#!Tq02N4SY4$gJ$fGv`FhGq!kg+{j)t?UA z+efb=z>>#z40+udZOl8Kd(x*>UqYIFPhQk5o#T>Qpsd?|So}Z zmNfl7R)0JsZ9Kf%+sNF?f4%gko9XQ5HrJO1?(nhs)cmnz-Dc9(3FUa=Ssj)s zW#?!E1xKc83+XNIrt+a{cyoZhP$65t=UlJedYdU|ZOwCHVz^~iWU?$ievU<9-kWQs zRJMKbykVWd+w*rHO0ljjvgqDUojJI;J5k~(wCDJ_tUWMDBbUyH%9Th^G0rMsOQ@$K zQfVU_Sw_$}#W%}Rq2`kaY6eZtuPX@Aa@z-9oj)3?#(T0Ng$djDx%;)MRxvI-!J~3A zQF?pTH@SG_hDg|iR>(ufYf3m`^J0L$utzAi-V~#&B(MietYLi#6tnG3HHy3$! zL-U-0*9MWzmAr90uFEFe0gM4Y@IK_khpx zr__Tz96yMik&J@LS)9`ojo;35mPk+*!j6eFcJ7D(hqC>e8&r>UyU3^K@v zxcPHOP}l?_M2$Db&fNTiEI%IACDOmj-e&!yl6ur4UC0VCUJl_`EaO>oxz;BC)$~Q3(5ReDn&UmSs+9!dcnjbWO z2nTM{>sY<@Eh}s_o!MblOnGaJ1Dt1?YJukb(HG~f7utocngy+~EKUo%9oaqVrKY0y z(@GL1SmWr&xuweObUtHZ*57+b<<>Ms+)wybJBg-~2#ot1&`_dfC7H(BvlZl8b|!bp zKC$N?g(S+2ib#{8*~(_G#8o0eAxw6jJcXyK1Cg zIDdM2vC1~opUTU>7{~Eb5t(9=FhdgDZsM+8IZ1-E9OFFo#b(^ex-e5ax1HHv;q5`0 zE}YQY-ZM$egdAeDAb1+uNhRV_{3oqpTaPYa=k==>cM!;h-b7ai7+%!5H4EHVjpl9? 
zm5AUmJk)H`iDEL#h*_}2E=Z{@=ZZ*QlZ0I4{{RuFtz?$wU88CR<%#BpbW&vSpqk|s z+`|bU>zNJ(PLM-xuN+dxasy}0PW5gnu5ITMq|y)v+-?w$rDT1p&Apc>(Wwk#f#=e5*)v`Uc0xYU`pxeEd1*#Iw5%U*2*cBet51(+wU_M&|(|Y~dxavzn z0Ph~?!e=-+ryEwQoh1yg&{ls zZ<8P2?@wz`6`T$adUIo>1O-N5FZD-#(cyn`GK-71C-;BnOVsmp92`qa3h zqU>GMXMrUza(1eZm3HKlRjV zo@%(62T_ytq+QaS0r}KY1ztOf5&~Z{BMJpNBr`Ex!F|nFM?k0N!KnPl4ES!e>`NxQ zwn!WPJ|d8Mjt}Kj_x!}>o&epmZ#19+c^`#ZLPoZqbkZDiyKXa9?U&2GWH*LRa{mDA zx1g+u5@=J+Ok>k^F>abCF6on=am8fm-(y898yB{aM|NIi-WX)A>>8Iy2r7CWq*GCQF}6t#r)@5l_O~|0Wd0c=UPsomh+pq< z>rF;?A9%OcoC318dex<4JKRQN(>#0Aqiwl4=Av*y=M+HSya0-!T##h3Y#fhjv`VBx zQp4$0jGkPFIp>Zka2q@lI|`c(5_!9uJAkQC)657PuUsCrn=p9J8A#0}JN)G##CYd> z9+de)=Ci2B$&8-;#Z!RG6q9Z~YIVUBqV##EVFi6^Up2~ySjkXuaZ1etAvUs4iK9|e z`@Jd49HD%@pD_ER^`}nK#ERfDAnJw3Kar}o>e9-Q`tzODcO@2}Rub-J^8ItgTuHE7 z%T;nlIpkG#N4R+g^e-6cjk)?&XxyW*P)8X*?9{z=F6Ehb!~<3Lj%VqB_n^`{VB2Pj{#pcBm@J`f2~D^Kb4fcau%n(>&F~oHTkoTy^Pd9Z$lb{ zVdj4lcQqZ{sKFXFD|pEjamn1(h5U8k_2qVmWY?Tq7^VVz7|{Dg2xqt4yN@cx8VZb*+J zd2&X|?Yp53f%wy8Xq#bUgVSj2DjSIs5;v#J-Niw>dX^kwt}!LZzO#WA71%y_X5@Yp zo81mu3{QHkWgs^ylQt0Nf_S9`i*eIEDK)5EwoXJ!!Jd7z1IGe{e?>wYbp%>$nVO(v&A`(}6oq25GEt%B(TPJC4M$gSZm?59dk~ ze(j%oIRKpJnr7e?bNW-FBPVo$a5K+(K*@}9D*VrZoE(moNei^iyXHLwPbel(a!Y!e zj7rK_C3{{VK9)J=$zbI)3Q z!}kTV^s1;g%&pLXLBXq7+%Vk*RfbcFR%sV_+&hC-i$2ydCe`GCc*v%$plQ@1;QEtI zP0(ac#bk9V!T#<$P{cY(L(9G_vh~~DeW~T&KQi;wRcKhVDz6zhS|XC>b;|tenI9;n zVq1Lz>Q{?AeQ-FWTiHdCgsSY{zNJSkYq^tZkC$;=Rauo8!vTTQj`c}go3u(LFCaU% zFMRZ-$tom+h@>E36crgoEajPX_OywJ-Ujq14hvDeB(Ptx;2qNbCGxjR(ln zEk4U;@_9hzi#C5M&D__r>_9?wRa-gvo z1Jg`~M!`fS1Tw$(QbC1zQL!s%ZWo~sF&y}|BKzQ1|zgoi8bvM+mON@wDgkS4G znOf%NTg#WYoA0sd{n6=6&M5~q+0*mpt)6fK#cUQkr8IG@bfT4FRYSq~UA3@KuMCr#5$8)9E{{Z9`!>7)@ezi}-pK|N{_5T3S zYR`xcG}GTQEmrXF+`9h&e19szd?vk2y+mVFx&(jAPCp@4P6b@l{{ZAIbN=xDLaDc< zZyk)D>9+16A{h( z)c(J%L8M_M;*f@%Q$gmRQJ=j`2?424)FPgw&;74CV?@<9V%B5w z{{RCX-qhV^P5V0=s8jcBa^QanrU=ZZ2B^bkmo$b<4J||(o(%&WDd5yupf@ytvmC64 za-15Ry3D`QscywNB-(N+Mww}kxZb@7 z)~X$QaUbZijQ!r9H^0)8l0c(#OLH$;3;zJJYa#qb!CMxt+eaiCqW&nkiJd;qoBF6H zjD2c((Oj(Qgo)iLclQ~tX!xGwDgOXOLVD!(roV_USv=H^5UMgj2R}+V(76vp(+8KP zi2?rrSXsXgmCxKR>z8c*02Ob>&boECK;LIV!#{T;C;8SM;OtLRRUJgth=T>-^G+Zz z;)h%u@kyGYCMMPdbj1cd^U{=)?M=b-q%tzEo@u5$^&C}!*P3Zw^ueN>wkt9odG+oq zow(5jrQw^-D;_rIuWIW;2ILya@dCs&<@4FqJmn;y$vhq_I+4=lLR#43ufNh&Edwm5 z7@fHJNuebB9kjN{AYJRV-17K8!l$)#*8c!zx^A9%C38;sIDZowzH4{BvbAiT>k(N?}~_1N>^cam5NI&l1Q;l z9P`Z1Sw{SP$AAyDdXK|v18w46Mn+H^SVpbReq!JKYVzg2TQDwqNC)nlug%jn-uyPz zZyH@oQITbt#J2Y3BOiD+)Qdlq3K*1a3{gPOPNNlK+V$mul)zz2ar4P1 z0gtt3Nie&(SR!DUZb?wU{U}MAHkq>=M($ZvBadttaVG9_^rW+Cts{~bw!k1`N&p6M0*zstL;aU`$eP;g_!*8{wR=&xcx`R3e26@m zzFnJf^5crjnVM+w#FTTrRGjUs;#s4l-@`La_SoTvt~hT0 z09wh@qk_)PYw30$EwGW6<(uj0R;{m|Wv?#YK@cQQlv9mlpP_ zqlH!K{{ULk({0i_scs&^0I`pekp17SVcc8X!6}m{$_XF?+OMpEt&O@Pki;`=Z_1pL zOG8-6btXib<>t7MY?1iK`y-`AI1yX4hk&OaTC+6%Tt(fO{J}+4xbp7pLdP&?4gRnC z(oK^qY>e5KNu*N~5V-`@x|C8!X0wnAvkk>S&0Vmwj@=l|k(1B0MSp7?vnQ1!7|-yL z-mNn$7j3K(*)m=J=+5k9R(;F8yB{d+8PEIGwQeDf(S)pqR^*vn<*PFCXp$fiHyrg& zMMPrns5jV=?^Fo$cB>#w-7+(dhOq>$WM_9DG2{w!LAmp_hJU;S3O9|`Wwc{ zYG+*Af2Sm(F|ZDixH)`ed^?BCAHy42dAwhR?+#wH_Kr5spMF}Abg{-4aehCDG5ip0xTPF<@bAXDrX`v z4_-PNsUj&~zxdM+m~dUQNhSuJm!8>XxiFv`mN?Z(A6(TLH7RYSf3s>5IBun)3;I=c zj(Ja*kX%Z=3nCMWvYIq6bFfaQa&A&VAB9p$Jx1NODxo(r7Cm~B1z(13I@^P2(@FjL z5Aqd47AAZHZ?PC+>w@)`CFj2xN%v?<_!(QE8Uq z9kUsGe5H8JOFp45+Zj?Y(POI$!cSWwkzRPkxL~ZiPi5m34X&9b(--+lbs47IN+e=< z`U(yCg4DzlhU(j~Lb^ldDG9)kw(!LFrZkdBTN^_ARVF{aI}S{p(+seFYoyp1v_ zQU(d-zLjDpg>YB#6d|I9SM3>A`3FplW|g^oMhkrgY8jh+%v5kqJ~GkZjBoK){ZrYJIXw ziX%9b^v|tLGaDOOZDM%jk@%8TAD9%=NOscZQFS9^Ss&hFX;E1Uh9mH&tRP`R_3ux0 
[GIT binary patch payload omitted: base85-encoded binary file contents, not human-readable.]
zWv4D_DtCq*g0?pd;`E&kbHcK52=(9eCZsWtqD;^3=|?*^YH6eO5V!Y+4ouM_Y%6XV zXZR{F0JM7`664@+O9j?h34n;641CMZNy5?qGa1C?B)w(}sMCNT49bmVxcdWyMJ8w? z6fzjfZweF2Ef?8}qc7g-8dk~`*``5;Mk}^}GEB69X|_BK2DBRi!%FNn1=6O&0&hYB z`EoI~y!?=u(X#$IVE6-&i_{esq|J}myhO5M+<;(pY{JZ0T8ZR%8S@p#y^*YR2a`cX zBew)bK#3P07P7wVxZ>19HIV^1p!~s3R8w^jbjW@X*s+9$)9G@s7&i7qw-OOL%r%SO zLve?%uh^$oYG_Z6;Ls2lLkq^FKtKmW0M7@(1$8ePZY^Iv8Im_KN+A=_T;pSYKnOI? z7PQT?A?!5@|MVs++8_D4Ms~Tp;dQbmc8+M4;B|$EV}kr)?kUYzLJ%vQEiCL5{AIr4 z%=c0@#(vX%aw4ZZWyTn{*xm)Xa1CYNL*-0kr#LaTV0e;?A$cFD040JQ;UX|0H-O$( z&W={++HGU-chk8U$Nxyr6Grf7K6Xp#)_lvtxTR*C=+ngl&}vXsZ-FxWr<*e$R+dey zOQ4Ix&Wf?IT0Y&s&h|!&x_Wc5J*MgPO;z6cnnm}XkHkYn7M}~^H?&9oQ0<7i(3}Ni zkF8)NUAZ~7I>>g_eP;G@&0^gW`pIq{%2LI#duzq&UFVC6(0id|#>b>ZM!uNz{@ZM$ zPLIk@&axWqzS&QQ8)$viE77;|8F*2esWTJ=V_PzZ`K&zW$KOFok$TQ43r?Nm(~bR8 zVWjNFt^y^%1evXUXldm=J9Juz}H%0R*^cI}r{gX!= zt2>VWjBoE^NcP*`NHI~Ebmo%(SuRt1@~na2PNlcbOw*r{7TwmW7|NR$%r|l|8b`Z$ z0`;?^79fghVN22mPi71(%ven~d>(Dh%(#rs4SL`j`$pHKQ!8bY{n{S@*2oATN8Q1q8DF2xCh+!Ygv zp2ycPkeQz*dt!Y7_dUg^xqZ;yDX*6!dbTgvpdXo8r=*h__0B?WygbS0Xl1^SJGT>c z?cwWOse>Z_jiXD-xu)0TJJ2uNBgHP6th!fT^NxNOw0VN7_|L_}gCF$OpXXiue!mN( zPIVl%R^9KNirKC^@k9ZK8hWp!vZ`*R8PAH3>3K0VA2LOnWAJ_{5wWB3)^`3@RM1(H zu0D}1%8wuZ=+qJyUWKgD_z;b>+evm+e{ghV-X}Tj`2sR-vAX}L`b4DR9B(wN>PwL7 zQESi+Lnp!9aWcM3zYGWUF|>xJWm+rqHl-FSmLds;5A}m1<}&jk9i-$xWR^W+a zz`q>7yF4g}@2-0&h>J>-79Vz)<%MiAQnwM4adB#J=Uoe`OiP?6KgmPh_ce1`nw)jPyu5qR~IO%F<_MiVfQz;{ZDll_x;aO-1m_B;VJhq^Fn7fy$vz z?aEqDHcS zMBiwg&C#ROpiqHj*8$AIp+6)ohJZs@bn;xI`-e(?X>x$h)MuI!S&artVq?*Fqci#G zXmmN?o#j=7NA!>bDuL4t2{cI0VFpD=1-xtkJ=lqX^Swb|!V_T0z!#FP zTLSSR5hEAwZAzo6aRE{k@L21V-=KV!4TVIBwgQS0&W}=ib{Q$^&Hav0PV&>URo`>b znL;PRD{NXdNC;z~t*zi~t#lu^$QHeTW1q)CbZkMW{wO&sNSmvD&lVNJ1c}@!hEKP0 zx}?aw{h|NdMZI+Z6CJb)$)&&!3b^$x$|)+PnrnuX0+&hP*n(nfK=jj)gDtX_I=yKM zP&t=5IOi)BvVe|bk@ZxQFst>W6>5q-o9tBgV1$s@2ombxA}_ig)kUHZELY0q}P^zbN!k^7@h!^W>XBfbuEa2y_gK2k|6wQOhc z%oa4Nv2#UC*rLeGVZu7N4(MWm;X^#i@_4yK9!o*Za4y138X!{(=vun9(+R{|x#A(X zD{u*xY<0arBE1~vTZOblbnXu~fzzjR&lsaJg%RTB#uN6fhTC$D&mj$yHmy$l7|sRL z29(dF{_Cg)b2&u?mc)aj5;!$OK?nBh?m z*=5DH75nQ^5oHf_cH|0=AIW{FW6zGT{%WwWC=JU~bPv^}ygQ?%Qs~tYT{?1}euRUvif%`2X^aHtoC$4_4`Md#Fiqy2a)*>5};A424FRZWq?&5`22TsKk`Z*%&YQFyXZTuGNLbkF3 z{vYIv*U{fI#uRGin@i2GJlvyotHnDcfqt`l33ER*;&BkSeU{rc$L&H&)(Utf_2RuZ z>zZA2xV%a7$IKUq<)hP<_yRY9uP#N}v`BA@V;Dcrdp}Xe8$-lUv)js5gQJ8@39b6) zG5c>*-+E)FXN$ThV@ekEYeuwI85yT$TD2K+R=W&3>I*v1Fh zJ;b?Hw~v{};-bSK27aMvCujN7Vf}3-oTbSO6M3Xkzz2^%Q~7!;s7~iJcHJnVWP#Xz zW_`T$)W@cnW2GM&etZngi`UmHpEH153*O;Pd z){}DvRJ$|6AGM7|y)DmoE#8OQY&;#UXqKGI4IQ@t6ArO#*>YadSpHVkpM`RT+74uB zk~_+}M%(2Jyx_P}p6(ZVjr_ky^*`@dZeRWw|NRkvBWfh?W=q8zj}W<$M+MMzoM>qi zL|Q7sYPaOKx4&t|44(#w>V=ZRuUwve=)skGGq~uCuI|G+=c)4$k-0gqIM;>9Q-hwX zW&SUEpBqP(5IVTf@Z}SCbk{b4Xif6ps<1aT1md9R@_0D_x6EMBHC{ho%7l7QmFz-# ziLz!DUJ=bbw8-*x_oWhz^<5X+C88XaciWU>V-`=MDkSDrB_CDG{K*^YGjaAf7A*TH z@5fOA-l8ZKXMF9@5;HJ6@99n{B<*{?|B)?EVBHKfS z;E-G*eei8~scNx&AqYALxsAcI7oVcz{Z3QPBL8}Z(oQ>cO^xVpdGVX11X1ED}O z8NXY3_YS99fj~<%p1DTujBPRq3LPRD#gHP?1VI+jp?%Ww-nzWnLjT}d{Eb>MwzgF6 zm>h3NWyvgo`8J;9uVu6cRE{b`0@neEf*=|~orz^>><@wEcU&FX=sP)ulY&CT$-A$d# z1_D~Q!~cZ=mSV)=iOQ`TYxd*0LL~0l^hzRm-xbL(#}qJ>MROSfwXTBZ+vI@qp~2Wi zEXHmWd$gdHXKtO^#Q=OZIExr_h=?P%M6CgL0!6Nw)(3(HvJ2Aj?4|Fa2OJ?c8>fs~ zsOuOITC5zS{Y1ks%eEH3Y0|4`z0Ob&3FHUsAfcfcB*46A3oe%B7;ga$$3P+GZ~7fZ zhyfW)a%F+>JCHkANE9r_ZeS2a5CM$IEqeg3%E=MHOK|=HB!cl=OrL?Au)?uc4PcIk zb%040kZ4#3Alen1R!glVIaF?iG^&ke)ejAg=WHc(?ZYPyQV5Y%(N%mxt}*@NcXrEu zK40%Nnti>!UU507yrx6g4jz0n6s?Tx*%A>x!Y+M@4zn|$f}bN{x3DSIM-Fhhdh7zY zAfFlhD^1Y=LY@i0g6(iXy>BW}$Tw*n;8T#$W`%%}jiN-@63}C5FTwvYJAF3{;2;X= 
z@*Ew%!`S+N=t)&C&XR*rv={o{nUL0_idBJ%gz`hM!*4Y}#z3Ug8fH!o!~%k9$XB?a z1-J|8ZR|Rld$>{^Fb5$MAwouMIbrp4%y>Gt1Y!vS71YbXfeuX|MIqDEBNT-}@ZQh_ z4!A7;Z{oZ`Q-%+*%(9svB6Bhw36wR*)5X@D*2eEhllhip(ew%yKM8b&N@D>dTq`iQ zQ`_N8PzwuCD3R)*0r;atj*WAYI#+wcc)2L#<6b zCa60-Kqw+n4e!=K3Z1PKL4!)D-}P{q8TJvo88%n5!o0X>^~$Al*){h|4qB?jtLOLj zKkIzutAga4YZs|@!1}lwCV_%=muF@gR+!H(t_McBK6C#&ynM;Jxcbyjt~+3Ybc(Qj zT0n8VB&Hvz`+ASTa^JqCXP4nr$^WACVVZFiZ1m0pt`Wtuc4HvuNvbxfo#*> zTtS4#(=^ZmeIP}E(&k(Xbgbof=(VeE8OSb2PM>eiN`mfXjMs~*(wi!3=QodR4(51U zI(2r`B0h{hVHsVrX)!SEqFHnLbu29%aEHdHHd-_%Js&w03rNXyU&-J&59aXPBHSC2<6M;a|1tTBsU~Khy+P{$<4kx(w=V9ZY`y-AsjPaNBCVv zV|pbVujXIG1E+v2ke8jJdj8}6MJ6gxdSW!+ zC$LGEnVDrVUHPf(C5%vL-+iX{Sk4%ES@X(czP6M!}V9?kl8P-_I#pWYOvC@7aYSOOKUjK56#U zT@Sf>5uq5L`fEPpqI2X`;$1tQE$P`Z#+S)wrHi@e=M)mN6!t?J^(PI(L#98R`qX=m zda`3-GO>0|_*&Z7Gl9q}m!I2WmaKKXmffivStZXpThd-y{2hI?CiF`3);CcaTKZyW zi2CIAA@9S@qZfiFpS(7)?~%J5|HKk<2%gTisCwsii!mv)cv9bfN@YddFuP@JGUwgP z!^vNpo|cq;#pgQfhk5BPG)UymA2fq@9_cA}f2lo_y%FH0CjjUG?=?XwQ*)ge-|R_Sna?*qNLPQJN#` z=C*J=2%e5B!>t$9Flwe6uiG*zmugjn7)q;oWtEO6ui*nnY0xcS6yI@Y+O?Q7wtZd7 z@OA~%nL5$8VZp0<)0sATcP3*TY|GnUYZj|bNIt$2Uj2djt$T}Q6UcQ@Ow{tv=tx|* z$iII~rF>pcl!gQ5GcK;}n8~Na%kjOf8f*#4GBq&Gx(fiKS=97-=3knOg@~Vgb{!BB zC>tIGIN5$&p%O)DFPK@N9)_C>G|C7L2e$49^ ziKnL1^BV>}c5PDC*ag!5diI1)?2$VZbPfL$r|;ZZoii@5>%X%gI!ERX11YsyW1fZ( z!+;o496wEf2i#~xAQIbr@x#viZ`9#p!uoO8NHMi@+4eaMR z3kcI1)Y&3)XqY{ad$+$Dy3=GXu{w8h`qso#wvZ9l{VZk}7{ zw+`%m7m(qy75D*F6)Q2eMNV>oi6((yqOGSR6RC?>E<2k!9A79`M=iiP@MK^JOwg@j zX`r(Q(&mdnz{^Qd03ChbE%U@F1))@>{Nkq&E&@~N95ZkDC1t ze&mREatTSU1VC@Y`fnnQhAM%Y?%!l^t@HrWArR-6k!O3PWr?ISj*n%CFu5f9Yr0NO zf6~}s;?fiUxibvrYIXS2k_6`J9M$DoL)G|kd}@3-NyiPSZPLK-0i23lNr>c}QT!Z$ zt53H>><#jzgi2-(G^+1VJI@h;YShz)S)es%$m^&BeZ)K$J6b zF@(GakO47(t^+3rHiAX6;EdYSJnXyyJ}lQ~n(+&C2F*Z>otw3eGy-T1EE52U>dev` zFOSw%SNC;Yv2p!O;_48swdaYghn}xm-5c@FIivS`7u|PVnDvsnuKAQ-*kMF;Plzm* z-Pra<`SH*4|3D^!gKX>5$N%;YtV}=U@cga)%YLPCAheY8K0=HY5cXvsB{BTrAH7Ln zwn5+ZAF;Abvod2R`%$vo7trLqyA)>bEn z;T-_AvbKWTevCrLY`W;Q&EdQw(SXX5%nl}?698q3(yA6_3Ilyxa}ctRtuf`8ff4*> z-q({7&TS2^A-FiUFd$^f`(6pZZVnC34FRZba0oU6Ng^kmk1hH%w3k zyC^|RKrU_O*fgGfiUi~?D9j4(I*o+@b7dxg;u_?OwvrnG-ND7XIVeLwx94bT-dqoI z{MEUP**A8`o4_gQDA0J7Qa$eq5YtRU*Bk@emGWUnoyH<(26-yZG0DH_Kx8mFlxvs9 zKT3ina%=-f>3(6&`sxNi#a3=0d?sFMzSIVDsj#-!S+JV{fHdFFjn|<`_nwt2=U<6@`?JbBh8gH`rw}xuoGLl4xCFo- znAgvqJ&HX#&$N8I|MnYd7FBGiar|~h+}KdY4W1yGJiPeLb>T+`MANrM4-VfBJg&X; z#z@{0C24s4p3<|>p<^i95lOWVUGnkz$03o@9yw`A*q>KBj2{!Q_>4az>koAgg_L^L z`)y?fpjs|Z>45i3XCIzYT@rkD>qD-~M0`M*%KejBAIefYM!4Vp zdsKuvoc2}I#O&mww0m94_L|)U&kGlv%Ol-7+^V? 
zo1IU-M!lT>tKsup8h0YpNBpdO=k4Gp^|7&HiODAxRQ%lkhVGxtn|gL1t?DWDwcLLX zm(ZB)ulr5~ZS(aa=v|eQM%_{IIlD@M_@AAtcDmgB-#G3Ok$#57*M3GNj3)tO;Klc1 z)944AqQ7iq&9{D^`{b02j_66OJ8ElxlP!gtt+6d4;QIfWno9%qG=3C%m`&q&f@;gf zTK0sVl^LmJ7rXrPT8Q(fz7ykBo}eD{1XbKaj_^i49_<gtYac-mKR%Vierm%^#n9(pi zMGHPztgBY9e~l-&tbA*+;?}GhDK9hM@oPWHS6zh!&6L)63J6-uq#tj%S}v_D(tdNG zxpkn`&|QqMEp&u#=jXh$M|x2W36W+wcKN%eV_|G$J%P2p>+T#iXkm}I*||?iahvXs zQRRa7$ZcGYOcO`9^mI#+LFfY)9Z**o2;Ra!jYTZ8v>_qW#gM@d&O$H&P?r;WtZfnK zGqF``HO8A2p5U3yN#ap}hm036AmZvEWuOoyd!%HCqJ%v{GGxgkeP?12hlXYCOdpfk zKY27%Z+v`Wl;jbi={)VZXx_G;u{U~vCiwp1OQhe{wWGXvkz9&tOaf0g?@zRxUL~RJ zRQ?Wq*O+doF59oTn}(5h{+%5XSjl^gi~lKvnMH&6f&tuR7aZdNZj`3fQ_cPdvxK>*(eY@%ExcAAfRUQ4O$mG6~EHh_rLamQKvfRw8u^eW^{ImEY<(A<4?7p4Sg)A}WDq_YSx*J~M17uXrDnb5t}))!4}jp#M%Tqh?f zK`LP3qduAwEkL__au#O5Z}W>=MWJu_C6PjEfc4{an<-#tVW4^+O(9eJpGO=h+cmGp zd39|t_h-tpreAt@RHg$97sbK-xhqHkiWa8zog?~EVv?V+ccJ|IkGMxkcbU%*g=7_+ zmX75?{xDKZtl|~L6O-8MVhH(^&;)R zI;=9e$Aj2300i)uxJ;rC{PsAJG-5TDku*+LiK=N#RprLzM$v}l29+ZL#;@TumR~Ek z)BM+gIQo((W^0dbn7ZcPIwpKwaDLf$bj<61KTJ8e<1!yO+eJ_ zZ1}so(3Cd7(hwz9=A@D{20&M>-tlH^!mO|zWnOQ3 zV&li7!Fyb+aJf-BnMWt6cp)p;eoe@9rl}d%Kh&izb0SZ53g{E;E0BglE1*kKbA>b* z$4QW1ze-rd7!pMztYHMHY_&R?;3zw%UuDMJq_f07oljKSZta7B|3+Pf=50apWt+IC zV?ky=??e^ro2b_R6je13#ZJLj^-Im(b?ntr}tN|(SJEAz9Fb>-njYhL%A z*sXhD|Il<#=hI&o`VMWFXtc38QrTB;S?4+4c{)|+Kj>k0jkV3idg0m9;awURPpxo9 z7W4C2l4MoI{5+369QOonZP_<$L;Kn@&4x3*k3VeBz@3=L(+dNrAcML`p(IQM{+kKc z#!;4_rh{ZA+4>@9(hD16mCYP`m7Kp2E$*Mzi@D8LG)GBe2Ff@Xqf>wa94MP2IXmcC z6E#b^I{DJ+4v?pUjHe{#&e&m!D@pl}ga_)inOO!GZXb^?rj1A7KW){dkCf{_utDr{ zLn=wKb2pHKje@VIUMREu8D3Pte8SNXdhhlT+{LX|_GnwEd80qx82CCn;D`#bP2!2l zRb>^u&t_cSms)mEx5_M=@#976)(>TNx1TI+;0vs#ePIQk=IV`-6b;-d63v4$ZpWjO zHUB{`s_>uP_u6IF?d`|aH=B(PWj6jYe*GV`vefTR3tlYhUC1$G52nUzC;aPJ_~i5%!ZyPs-7fTe(Es+=lvXhRmS&}u zbhRzq7{pC%t-oe^_<*SaqYvG{{gFQ&4ga3y>+`wRft~Qo#Nt((!FaUrYN4r(+xRm1 z&2q+zlDUA^EX>43WW#au3Ci=CV-G6^%AB8{JZ`hzZZRSuHuqb`nJ3F+ORC-I-MiZ~ z?zbH28l(L6^P?gyRrw@vq~&>)K&kg{@&35xFT2P&C*XkY}?)sxWTu={%Oq(NbR?K zS^3rxZc*5Z%3qKi?MT)3HkmPF|C2X9X2i|v{Vt9b`46!5BRhI3fUj2DO2-2D2P z(mCez;(yPh))_bMx|fjF6%cd0p|R@F`OW&88!b{S@W&r>BiB<(hK^15my`0|=*W_2 za`*m^FL&+O8#z=cCl{ZQZCNsmojy<+9am%U1OK*GHgemk)_{{9W6~7>eTx8y> z5$bhIFKXLTg0^efdb7_)?^9v^=3CEzd4?`+6I38+kG;?uq;l=&Q9KIfpp{APwyIh$M@iQj!qMRpKFsLs#v2BXBH!INBH?8#C zPA~LPTb5?<`mZU|_hdTarzN^w6UDd^AY~?V{7DKEH36kwd^}T(hDZ^^D_fu?*K;tC- zT-r(dcY6;UiOc`9Dgz2o%QygU+j5Jczt}C@?<%9_(0%+H7eDs^Q&$0 z*|2k6k^d%u*w0hTSY=Pw?!p={=0Ma^@rx71vj85wB11nx;6OBo>oz34b><+CDjFyP z+0l`oGH9v7DX?D}pa5__RJl76ie%e<#Cp98P)HJ@z#0I7gp@#Z2DGN;NmrShI18tW z)F*X8khAXHNwLw=D60+RB(8hlB@m-mfrou0erZ;``65$O9qYC*niKe2BI4B%y8A6# z6KVuP?=tL@k=*uTmwqT}P9T(kTK7E+TnAZ!;QBM7Hi;O4y`$i|UC7D6tx=~;ar&ck z-hJaM6oaTt5u`N>4BG8fFkeElK_u9L?%l5kZPF2r$vxt+4!!VPpTOP|0w>S`g92r^ zs2nn@8>=x41mTAau&1-RZc~2Bd1@eCJirCVy9y2l_JnQC;w@z4vy&(~X_&S}fs=|& z65yjN&Jq2nd+|N<6*X&i8Soaa0w`x-0W~MU!`9m~(8E&*>ECWul(Maq^y6&_Dqs&B z1QXr#0|&sSR-s~FJJub-;1==r2xNB)rUR7i+&q+wP|;<-51(#gt$Byj53nhM%;@)s zAHg=S1c`EPb@k2_#Y?}ozgoKfzt*m6p_j}gpd<4>`)T6y_ty0G*oh*yX8D`?K$Pl< z>&<&NWpZ3XKUaUb%X7SZD3eDVM z8RWzrL!9`JYPfgOQm%(CQcTMnWxKuP+DafrvVntiB5#zXb!&>9U2bzj%QvnCo8u<; zuZ$Qc1F>LW&Vm36%wq%?xzshydb93bP6tTzA3@B@p#PvO=BC1dJdlaOt{oFXW>$>8 z-u{S-g`($EWNj{_W{jJusIdhxet>x}B!%vlC@4_Z2<0Q-C-hDGjiIsd7@;no;Nc)u z{e{ft6&)HI0kai9Oqc-$Va6JYA1zhG!GY8WHVj{AkfgV2qq_e>M=4|IEm(QszYemUu z5(r<~3kc}k-9A7qS|)c#|31780^=a^DpRY4nmHkGO2?iDXWM~-VpM7$8=}g+{9-}E z$59%-2Ur*40GC9kisEwxK>|neVd$7*@59yXA3L_G_LO zBsgOWqY7H?7Cd`bp;7d;@(PMQHG5<10rTm9Q1I~??)YJjcWPFaTh`zGfamc)u^E6o zK=a;#XZ;+>`Es$U60RJ;t!glpoVp4&0k26_LTdN4Y=S0o0yLZeNtkuUGSG{+kT^?3 
z+{oE%jXGwId(uuFrs@wc^%ua0Z$%o-)b}&Egp=(WtAU)+S0na(1-PMt2`7wxN8oYY7ausf?`(&Ng`f2|lRfwWg=UL3SvJ1yBc ztux(G?}_e=Hz_XnxJu;O5I^{a8edmy%39a|B!c9CpW3%K-s^~22P!G9{oc#j#5UnY zWYbykPNM~rs->%ejabwQGGzIyvz0|4vhagRSi@4PEDpE4?eHF6e4b&G-S88av3fh| z!1`<3zI8tO+GpB%oW^b6>(P0XX5+Je*II{v`bEA;hYjy9H(Ts9B%L!pDS9@P`L=Vt zdiGRET>d6T^1glVu-m?Q~9e@T1`$MF(2%bj%YKeQeI(H<`P3 zM)6on_rSrW^Tr{k3Y#tGls>)<^;NU(b=>{Oz^rP5iCWY9d)(j>=SvMczp5Iq_I0;c zg>FW87nQTpKkOsydyKJoG~DeGJHBN5T<0Eze>C^P z1rTwr=d&u;;->XZugiZsQdw%e*VA~*-_4AyZMhXr*xSiN9tjqCCGYNU;_k6`0weakY5cdVc92TsL7K6%!yeLy=3l{1=R1ZC z&zstP4cqz5;#|L0vemP>P`f1T*5mO>{dw{c;@`OR=_Nu9uv}lu7)U?Mzr%86o(evC z^o%#kf>USLSxm!=4-F6RcycUY*8=n(WVVMq163FQD6mzv9A3Ou`}N{r3H;>-d;E^gD7DY`(Zp8;lcoYmHNN?xMH{7lmRB+8Ha@dM+1;y#1y;k0xFn)ka?N zKZSkO!w94MD5Ai6JU|U0G(MX$%Js)R%*K^2R>GF^iT*BGh2JTt0Hd*fz0f$iw74EZ zu-8b?azM+5qYMW!{cQ;U>t7)owzSo(n)`pP&?Y^y|8bT1hkRh`aZQLmP#Zp6Sr&Jg zE^DY>s7|=`xcjME1l>4`@?Kbh-;I6PWY{)w~e*W31XlG{&2dI@V(^1jD zDOcDF_S4=Rsx27MpA?vwrKf$ASixH-3h~a^eNxrmevm-$g$&00xEO=FG76ZT zhXVA@3E>;hdMj;%A=zz0D?eQw3n1=cS_>7@A{xLPVLijCc9U{H0GUMfB82FVh+Dr` zC~iwaXABH8wa6#4I%*(Y`wEj%sXFHO48umF2Gfd)^HT!3f}d-T8ya(>5m3K?QRFJiEr`=7FpUDVq9<&JAfWuv zg@V?x6R3o4S0S$KI`4(z-fi33U&x8>t!ieDWFD`sk+0Zccpj~WwHF{{TxI=KS%;I1 zL6dU&q4zlpqDZbfl_rLWbu$h@gk`FmD=9?tYT2Ry!9MFf;b<{_Y)TB@VwgH8$hcgR z{Ucb@px*h~#rwIrMLR<;ecqGo;4IJ}Cja_jKweL~*)kfnv zH2J&F!Iw2*(vFotun({ zbL3)f`;nliZF|&HxyiB0w1K=uxi}fs>Q6ejilwuNp3evVx8|4`USKjI1ost1;JzaR zW+t|V9X2_T6C?MDq5wWvBif;tmZL&2vq{4;cH%$+C!We|n)1L;`OcC&V+qKa(h~vz zH=iRs(dE~s@Cz{JNR_95P6N<1DMbIQ02N>i{^fbUPifVn8n`MM!l2Wz9^ipKPxOO* z6K*an?KLJSMrY0XIBD98@&S-48Nejs{6p>DH&23{k3%`h@f}%R4a%dy#tQu^;R9d; zm9-5Y1w!oJKwYQaQ@;(DMom^BY-h}S(8XJT(HX+x8apfN(akc=&w=H!2%&Ob>k0n6 zSAj1Uj)byVzxC1S7nz%IdTwG%66Dq&9jHLh_a9;eD2z+VrIDAx+2GKNY0H2kx?f&f-F~9s&!}ilhk*}i z;+?Hkhm9txJ`E;}JpOCV`l@CBg&Sq&d+a`6l-|bpAmtc6OiCkHQFLTT=K zCta9mSS0C6|@x z`Jb@B)wDrkiFN>NcqrKAKvq!tR*#W=BKtwbN8NYVPVc%HqAF4hz)9?*qDob+1vguX z_ch$tK8QNlLLa#JXU`5h^s^WX4~wY8uMzg$jxv?`fZKyXQb?&KGC~ zEZyjo?mw_Zobe2Jz`d*ebdHr{802MBlQ7i#`DpbsZq9m<2>D9CD^gSx%L`uK+41Z} zWptk|)}k(<_QUber;lVKuf5>^AZ8j)eA{tDhm{dznvq91?3!Y*Fch~Iqe*KcugL3nWJ&ZlEV zRs@|}M(smsD!dDDhs)(odH%aNMb(VESli9s)a`rs5cL z-#%u-6@U2P_2;-#6s^wvrw7A%FHn6Ojv6H&Cs?@Nwf}3vZqVfmLOC-#O?iLM**{I@ zXJS3#`4~gYjj7|sN32Ih7Y}V4Xx8fPdajVXpiniU_T=Oa%S(@p(%Pm1yLU6K;+?TW zmyur4_;moEeOE&m8hnn{CUH9H*r_Y=PO@k}P2vZ<51-{)z;KLp6c7Smye3}@3lk_|OxzGsem~%i!p$L)BOcO*3pw4mzhPQWpmZ$n3 zv{?NlpY8K2QPq}Bjr6ZUlu8Xj4%DxO(uvH@@N}triUwKfdK~a51Sl}UL){#A3R$$q zp_jPMyHp{zyDa*6wCaxh@^-yE~z{1@(teU+P%QArz6T4!Z=1PV5_pW9wN zeqfcmQTcsGx96Y8H5Ik7SyZ*UZ6&PbSEsdZFl`CQy7o-DG$*FslH^B)%0NMg?Lbxs zF~ZTp@^BDqjVw{dirC>r?I--ecBaXB4N~OD94)!ocu2ve?$LP&BN)wFQ3)?WLW%sq z74k0+=0OQ22$6OO^hdElKg1+Ya;>Oz5MC(}B_hlGL&@NabvNi|f1)PPe&Ha%wz0>R z$R$uz{z!?q-Bw~zO3pcjE9rPSn0Lmo41>&on-#xBLQ$wiP_E?_N!ySYM_K=q6{HBq zDlK*tfyYiqtpx#l_OI6hlp)El45JHj38ydYTo8VuYJfe`+O9T%a12=nHb+oHSkQT+ z3h@6^Ir%OK*0zczf@hNIDde1Q2g!bdx$B3tdLlQ<6Q>$3V#8)VtS~)j?3w_#DFXat z1VgG`H&>b(0D)&;Ermbn{IDov<|}w-gO^*hIp>E=Yl&l<8=9hngmT_6RDY-hq;i4z z12nnOAE&*#R>+e%0ibACk-_pJwSaiKM0YeD?=x$0mLbvHc?)4I*q zpy)m5>uE`hT8+pTlnaqii+t@kd2%y14WpU)bBH|VB#q+wxr5ijsc$4No_aczU>rw% zKILQLj0sn>GVL&$%q1o3^n^y1TGrsvulR_YCOFF$YwDX@;HWp{o$uQd>)30(+`|Zx zUgoMZqChYs<|jsg_%K;x84Cqa%<`$=bn3S-%kZnKV>;+jow&9<1{tLVpjb|TA>H4r zjs^55l%EnrlkCLJq%)D+Fd4n09)z)-`hI@wrtsnBbE4A@=aO#-w7wcFSJNes$#!f> z4=JBR6|q4LTpi2cV$hitc$b9)j0w=_aVbu+W<7myr)d;`Mb}j(_H0mxv><1)Mb|Vk z?~FpR>keBSU#8&K|NiA*i3YNcnWvW`-c@e|_|Q6=OI@Y`GAOf1=_c z9^6^hv!_(2F`0)U^oyUPP4N^vLC0hR?iDWWEF= zRBhdW_~2Y6-pY*no^7z0H1f{ttLw=0?4FZoy4yFtnz`SLaN_*n#dQVarX@Yq*Hueg 
z`m^?=Q@3rgG=Nq8e0(?tzs!lZh}r$9H1f)!wjHSEA;HQ}+)o)m*in9w6H|ziOtur}>vr%KFobQ!ZOG3OK^Q z{B-?;Zw&b&Yxi=8O%$sJhuQx@fx(%~&JAA$1}{Xm*CPxr1iud{o;}Doq*BmsH{sZ3 zj`pVo`usembT_E$xbvMGHMd8Pk9tTvIyqK1HYFzs*6OfalQRZ0S%Rav%`INlr6)@d z!ALC-T*cXOput+Ve$R9_B{>-NjvQrUfR#cd&S=%2VKMe%FWuv|DtxvzC}aYwT9a1zISB!^nt2=5{9&rAt6&Z>@yS6v$^qB-yu?U~B!MiqBeMGpq)vUKY8aCr> zfjzvo^b7V5X??X*7Jy?#S8vu=j@qi-%Lqc>%TbM4Cf_{@tbAZ1=;S zKQd8XatkiKVX8--I|4dyl5hXRa;8wK`LgfV1e zN2szcOe<%BcOV%GMI&Nv_WX5RnpK$V<*uJ*P>|A2q?Z(L|A9Xk5Z?RRC~m zzZRMg7&XBt%*_7k9=++H_nSzyGn8ZTbYY(&x z-eNNwh7=6{X*@bfO_7|)Pg1WeE)8%g0wo!MUPeiia1r^yjP<*Lc}aJ)i~utzKM!XI z^hcR7)%!E&S)??%qYUTY)6XCSZ9Z^3OVG3@xD&n)9nu6r<mm%tON&D!crEI~BrC%@P^lT;|xJM^p(Ea-_c!GxGEO zx_<+7(mu+U1K<3lv;yN5!8slaEIXisV};-X>d4T4P(E=NhuS*tJFe&3st4uqz{JL@ zoS|fe)ai;$jpde|xWk2!XO5EZWVOOmwb`~DuZt28yID8vo}M_D$U z3t8EE=l7D~h52r6;dRhE+CZ)lgo9cIu>L;4;2mH1O%((joDRjRNh)nIj}2&!F! z4t;X5Wfa?El=+*j5bD+wUiVwnp}9~28ebKk@O;H-Lf68I6jaEp&xOI*XFHRlT1d}y z(>^sBgx*~plN0`@lJI~XB+m%kZCN5R*poWpoaZT@6`EB^EfGh&sV@} zAg9uJZ0n)u3LQB+MHqp4RE97l9m6^W7`KiV@mA&6{ahL88TPn}FM{y^(6vTR);m^J z#vo@#`k|&(iaC-QI0(cif#Pju019Z=C#&CP9*2bzg&;_`uk3#I-z4|=x2p^&1x052 zr33U)Q=8~Sy{FdLUBtuS97Nu+~r= z*kr7A1|e*>*t`W#_jL+_I{&f4dGEu62zhn5osq3ykw zux|)?y!m9v;)a)Rh6J~(LzE6&K}tN@ENgNOXUzbUnPO?gQK_^`d$7B=lW$X+M(JGS z-=+$Zkja36sT?(bwRCR_$i6}9rGZ_7c7S72wJjv#aaW5u)HQCL{-(Ckyubn_IC9>1 zb2{}gQ=h#HyDXTJ80mWzl00#d%uQ7Sxd$zy%_Z|tkifX)PG#%qGYplz3Jy(2l5>Y8 zva4JX(2&5xiKnuMf}?ZM)%TKl;U>jJJqT%?DPgaP5bhCGt8UGv9LugbP~Z&UZ6`XE zg+-Svo)J4j8PA@*n-AXe85Ptp*+P_Pu9th8{4-WnrM`!^@O0$bol3XG4H5W8!|BBx z)PmkNS^SRBAO6_^zVCDPTHucYZ)@1sutemQeW)nq8yT0de~?ew#!rd&ahzKdYo6CQ zTV8o>0gcUc4e3q(RVkY|l3M?2>gV5&M@H`!Y$HC*_vpJ2t&*m}xPpuN*MiRq`DeQ8 zCGLrlvwJk1_=ulB)ZDEfv^($lcji7|!h8Q-&lvAIf2PZ%t) za~VxV-nSd^TDX2-z0cDqqdiCWUuO9SY|3ckcLBK6kt_ZsitHhc$Mob3;+1XQk0fPlVQ@*j=GfPJ&~oe>^Lvo>qN=zJ$meu zAv>J<=|gK|JRYy!txRyN`?QL{JF4v8^MSYDj1=9ea{k@@nC>85G2PmE2>(k@%j+n) z>f3c|j=ML8;od4Cwgs=V9z+EBDeWpq3EV#qsy?hGPZGh#tKnzW{h(Xmc~1%4^)Bcx zr@tMS!z+4p2J?>Yf#>gbD0d?%d)9Av47%M8zz5h}Qg%GJfs}N_QP++kAFPb#Eds!q zlR8#|M5bn=F%TL{AV}uZSE)^dlnr8Yb4rcG#2-w- zb)&ysYxZR|miQVy^`07OE_o3dKsX-8jfAdZ=Guwl3(#Kk>!1EUQnGuaqShDam&w%L z6Ouj5fjol)t5kH$bx1|w@j``@)G#)1S%m<H`1eBo_j=e#SNhO%y{fd>G@@yVoh2l|>U{>?=y8pb8nGj0I|;R5A;6 zX2~)4TrFU;y?x)Mnl0+Ue1t%32L4#d^^7nIrFU)j6C z@VtJ{dz+>=;BNk_Z0akoey=I}u5d2x?ab)?*nj?*iS+7MIHlmBwmTwbUo+Q~?N_h;^sk z5M0V4Zp~W3=5jHvS0M@_$MC2fRbm`#sysGIwGdE~aW4Xz#Fjb&8l1saAFTvn#xnXMW{fwB$e#U)>f{57^+LT4=f6L#%zZ(=3P z*GYCx0MK~h1jOdtxH_q?B>@9-P`{x1?DHc`M{VpWr5)`-Qv|Ku7j(o<<*^*A-3PDv z(J}~U1U=4JH$ZVkLZ7*|7sNZ+2^>(x@*Dw^^RX7RoH0*~y347Xd00}Z8eIlv2V+9Z zKp*&oh=XO8bT9&_X;=+uURyUHz1WWl;rZa8skA)-OF&9b3KSM5g@z=&Ku%aYk{cU# zNYCBiDidWbEonr%JK{m3=o?>YRt!biS8^If04RgNRdA>XFkNFnP|0E|w)Z;pdV$6M8XEG61YauECaB7Z{x$Y2+=z%9VTtcS z#&CV4ASeMyAn<(9^~C}=+7y@}sUzRZY5;6k6QkfZ>!)0Skn+@wwglzRdWN1?S0fQk zaH^mR%Z&i-^*Uy9=x#CkqdA*k7LRGe!3mW%GnY;c6#fx+I-%py7w*)-W6!&tuV)@? 
z)O+1m@5|a|X8O&;^;!D<>(}K+Qa65oe6Bbz?MP8XAe*Oy(mc{^=wiEXaOA?om1Y*r zV)cvb*?_gBU*FAhH7a9w;6a5gWt<$P4Yol1dB7pjDFQTrdld6|urCavkwDeQ0wOcW zG29^TafZ)!N0)vDnvcY>26Eh7>=ZZpvBZEn>^8Ca})zVC!SO?B9F zIBvA^pXd_&4u!~zr-!;Z%nf@wQFVdeU3%`Eb?$A(UXVNf}Up{$2?-Vb2ZLOz& zWA-wa0{4}bSokOGRGPrAz-H2p>@U@$;tVeB@X~x$GR}Juo_d~55b{2l`27dD?f>lE zWL_>0$=|fzBj(uC0!ma}KsQXh_hiF@+-LWe82TB!$Wtefwd){VYUE+DO1=L?vADsvuyH=Q%(D2$L{8vMU?=8rBzX!fb7RSW#K@#a!@oe%e4 z9XEP`#iHO^IZ8F^uRPMZR6A-`A*zQbIOv3aBgyr~EJE!Pjc7A606{mqXKY2)Sp zovWSUjYiGyJU;UyNK3QFw6q`ra3#@nBHf$k7@eMX3JLOtbnv(fuPc+Q!G#Lw@3I7E ziW4ImoQP}z2TA}ja!U2w0Aj;oIzvL->d9QBmYXSP!y;VjD&X$Ux5 z(;hUp$r^K)ET(E2ZQ4&`nXKCPc7%z3Dfa-3(w#UE&Qtg2^S9v&d3EpG(5qzF+##2yk9KTa+O#&4DI2fvCSK~ADzn`a!hv1^L3j)!eBC6CPs39tzk z`%zL6fZnP*fXh4+IAzdOAbkGYZ6u>>v?LQiLS`2i_`M>WYM>b1ECn{9)=wCyV;KqS zXn{Z`Fa;QeD%gmB*p30l)ZMBl1c71&=B5@@1_=U@ZD}5Ca>@_MTo5`ldLcYgJEH^l zh!ki23kluuvX_&6WQ7@pq-SqAw9P5F*FW|IOy zQedne8c&r*^`FAM&nPEF7(1s%GW|tJ1Ak=G*iZmGi-_aT0oxtGrhFhG)cY4J)v?d4 z=!EPUu`b%(*uLVPx-DUbXpa;`b01;kq%<1o9d=6cmz4P!Cm>5wpsz9;~6p ztlEmA{<`pcfIhl&?TzUJm%~p#*y>py^Xbvt_~{YD?#Ak$#E<2Vqacb9Y4Bh#KOtN0 zW9}4m?i=uGS!CRS*=^gF|uGu{aCxLZYHG z25n2~Fz5P#wB71?w8UsLM-9Tty=K8=!yZ%+oIpbCcgp$#BLLj*ItM8)?iK5b(obCo zIk35#bPqOh-)|hY4@_G8kVZ_af?3&iQ_z&@SprCs z=(a1FSvg4O*$18HrA#;nhQ~F^M@r^q>=%Wo6P57hrOJFB<)bqtinwdR8SB0i*uW4% z2+xJt3!wRt3;wanu5&^JWwM+RG3D=M7`s=eaY2Yu@pmKPD~(6`QBk#2hdzlBnq#r~ z#?AcBY9;JM9?S_YWzqw;j~~@;iUm6O<i*m@vz?_e_49?Za?bZZ)r(WRGf{BgB zYLMW1*@BV#Di=G^34kExZd-##p#tD9Yn<#Xj2Lu(!6pOjlfe?~sz!}52ie%8T;h-W zLV--!enn%v(x5f`NEy+DmsJdUuVN^%*{-TV>+B)dhi)Jduen&0=WW!pp>-?H$H30w zdFSTlVaovT`-zeNK@U(4TrK73hRTyRJNmu9?)(EWeoiW4io6q1C3agd`M}hLJ1JInOXT7G09$0#}(pD%2I8(LU zp&iZO^~Svi-Kie|#;1zPCqUZD$iDgq)^;5qG~~f_uTr-1@tB0ks>LeP+@#o=Gw;tX z`_CK8r$gUvH8RO3+hX%_=6ph|8zbV6co07c^7GxiF0Mb3pOAU|Kgc1^&wP2``|Q|% z2RoCYSBurbPj6Hty>{Tdu5R2XCEM0FC*f`yXejg9k$$fck`VY9#IY#ca>bUd-W6Pt zclbaRE=N6O;&LIkzS>;r!}%7haKQRg8g}l=i6IRc?#ZSlc5U?3@BH`l8JZ7g zqs<2n_5ISFtJTXKWygWVtkP|*7lVg>Wq*&AZusmelAkkpEhL~4w5tq5`#aDS{N|hp z?ZG-?wsFF>wL3zncRNI}UX|nQYeQVY*|>3Sdz;A`Gg9q=3~!S&5YysD!6tKa_EF8+ z@df!$xt7JW+;X0MlLINsCv!rqoF;LVeWwGSqEsEb1nE3Ht zqhET2b+>{)rHG9uN1D|%yyLbX4&V0=bHQK%mP(X8lxRl(nC^19Aar_;I@TGW>;>4> z`_lR2FSpgkK1{t#G{k7kqE2tB$9Z`NJ#;wV0Y9oWr}L|_IwPkSUr`T-eWH7KP~xc8 zL?s}H^GgRf(gsp-3Li-+)b8{~R`s~;T!fsxRmGIkQGXM!{z?FW(wUS*s z`LtG`h=6`Ojy*{}(IxA;P|_80B&P%fRnk+c4(UpqyQ)l2UxpVF6yvjwa_=>wU}77q zq<8E*)c|-)HF!o)raJks2Boo@?amBo8uc~29zs{4Rz`E%3qkN@g}gX=x%2nBPF=~& zan9S9uk@)OqvGFU*j+3GG2HI7>L-JRh8HjjCro=NVV3v^Sbix$lb2CnFH>Od@L7~JXr3VM5N^Y9!{&%^ z-5saBw?zMAiP-R`CRC$l`crBc`mEux1UqwN28!8GcK!6V|9y&G9@>kFyj!2JOI*$X z&)bTVKh0tY76ZlkDK1&p7SR^n7wL>mtA*fstIadiYL3mSmQ+L}Z(vk^+P$jm|77{4 zn(JBl#dagPR|E2WtKioOIVy*`a!9J}-4h$*QyO5rwqm#n518Fi8T~XAo1u z`_~)~^oc8@`A@UYzXVOqXEc&X`O1&;i=ylB{J}hJ1F8dU7th*lj0**ha2TU`1~h4@ zJrcSkHd9WOo|y$VGT5up9I^&DOZG$Ri~>prt_? 
z^DKuH*00e?67<}|LbwMU?Af(T0`spzBdvjq4A5tb%IV*>eE?d(E$gofU`?FR6+`=m z%a04q@NIlc1)%pssJdee7BVupDb5KP%@?Mq6B0{?HAtg$0F&TnU<7A%imeRXe#)QG zyqKS*Go5lD`(Y5!p(yxOfcdFTB}9ESRs#p6$X?L$=yZ~5Z9KewK|eNPNE%T%$VpDD zv>vXe{)mA*W$$&xzvh(nEk){pnEaWl%IMfyPJ6m7c#I{-bXZ6|u1SQU(W5SD>VU zy=zaXNc2DWi*5>iPsE^yo6dLd`BuBr_hIU~^w%!ltBb$tU$w=f;qWH^U7#V5+ftXR zpRrE$=9luo;I57huRm=J{5$lH4M^p#MZ48H?BlHs#VYIixzw?DTTvv2=aut5W9>&T zPkXc1fUK&6dYyVvqH?YX$#m7zB}K)H;kCZ@Wy8Vy?`&dm3AQ!5ZQ|`TFLs2E5b#<{ zqgaHe-R|45j?pSc(FuG&AMpZsd5o$({&;l^Nf!3JkFGxo!a80`kf z1Q`FX2@};m)9Rn5xtIKJU$@-IP^mUG-QmrS`rn#yo9%|Pj29)Mp3X5ur9Y!*`NIAF zxyHf|QMnA|zsXVEo?Sn$Me?hhwDyc9A)+RyxMhbdhSJ`W1f8*yJYh#6t?du`Tj7rJ zg3(Cp=Z4J=-1yAqD+Hff+P#K_RXfMON9+uaMkOzuDw3iz}j?a;Y!fNbx=Tx1$*M7#QcLEFz1%op(8w6z!{=2!MGUXw7~yWGzf)f)EMK2N~JSx&hua}m3LdLJHchhV04lz zfb7o(Ub`LVkAz`x(pOboMk_D=hfy19qh}hX^xArRNJGE+ySTfMR+eh7Z(h^n#CqAg z$Ies`zVCbi@<#>Iq2V%VKERQc^DY3q;(b%@t>L_d?0U32YK08RJ&_qX(Zw}Z8lnE+SRQjzXr<+k z%dSzJ_dS}QPeeo(-ekDF87RJoWj=1HYD0dqoh7saTN;v_qoN20NJ6_GK(d+6I8~Rt zLPix%d-X7&UA9xc``SNC3on9UzcSd6e^Abu#A(DPGn6q)a<_^(i2@Vm8P?}!0;~+B zF+$6~zf|0mW{u{lSbLXBqXu&#OAs%Gpn&)PqeW|yH9*<$f4AhzhJx5*%Tm>q6CfD| zW&W(=(}`)0z1Qe3@qEM=2lAmYu5v0EB(tN`%c#56i==7ag$9iRW4EbxEQ3wJ(sOzh zw7q({4mG&KoTw=uC9y?T{$q?HmjWQicQnW2eHAu*JE!aE=If-l>!d`nR{?;Ps4XX* zsctw`7+z?OlAYG#CWFM0#e5GbtQXOupMd8>@z^xH0F8Yzp}^vgC;$9ts)q;#CRP?S zS~dfy$D++;a)O$920N@Y@n`j;4k-^NhR}?~3L#p45{&h&J^V>F2$C%iOgA9&AU}l2 z>`WWFszV5_%ieEl%nyaM%ro58B53I(fA^g=BGuA)-?4Noq1zY2(*}y2t1)sS-=sXYU+S^ZrG^-hgAe=%eV;Fshny7$Kl~5MUHcz2wBw2YsKepg zS)$E(yc7Z1uGg1guCG9-vxtjO!CIZf~bd- z#@13UtWz+krs7W%TOKX;_5Q1VAOpxd>e_xIQ$>QccvDo+e&HkWL!0ZP^eE zXSs~yy_P`VW(g`p$#$OqovS^aq;N37 z9>~j9IhC?O=bdaR8W^6j>J)I(H|;&CikZVbi`lwdEE>dgvNRfF!i2Zfm4l^Hf;V{C-YNPc9_Uh@eDniv*$+tW00r> zRSh#@Yaj+!;91P;0sykz)9(~h8jYBqAmLZL#fh*~f7^766c#yJ) zbqLHQLDp>BA@ZRma8QV58jh6r?jgwTatTiHv2IJT#!FPUQRbE)Kj@~}6-c@BQ|w=_ zrwJP1+)gb&j~UJDqI1vZy3~Ebz+mslCvbwO5iSLabC3PleCgi7BbHj$%WHpW%NK6k zFsRGih+8u|mNpcOKHzhp)B7rG^TN_KJ-f>Js(bmjZPeHPGCpwou2z50#*Q^@c^6I? zXDjMacJv)#9Ki3?%xV7Gn!KT#?QZ<*;X=02XdzCU7~u&@Hdu;14$3Gb(LLg+j=iu& zq(T4vbCnCY8&C>fh`{aikzsU=+o>%*^H(##F5xISg6*T_U4D_JCfVXxNPL21?(*D) z&7}Fce7_3r6&U5>D}u^R|Hs{%$3xZs|HEet23hJd3e8v|X1K~WGWJ2Xv5#a;mN28N zV;5OUH8BV+lVxJY7K#v+qOw(E8A~Neh$bN+rRBQcU7yeQ_q*@=ulv6Lyj_p0nc>Vi z@Avz>&pGec>-AjV1D0OR_mhJso?IDpaf=!hES$+h3#olTdr0Z>cUimAIL-GC48V5h zZ3wn_{LO1m1VXA;-Cpdu(Ug~PgMQ7OueOQX=&qU-lgD3hGq~pQyVmyE#_k|g=HbAy z$j97UGOcj|ui_7gR<$j;!)B3sC|8dq)OcV2Exd1KtyBCR#YA27lLfphqt-EL3>0iP zBX;*!KhIH#B$&PJQ1S~@O?PgtSA8n+6tg}12OYterjZ&pOImc}!vHjpTH0p$v-c;KLka!^SDvFQUW7;>9LTv<2G zK$d^M{ycD`UNoB=FVhZnDV&7}`2#^V0;T@?gtMq>Pcu?3b;zJ_=>5rK!0$64Y zPJA7T1^L1uiFZ~yz#QsyDfUVHiuHJ*GyKpR&k2s(!9Y6ml?ob=vXi*8eA!VT!s6Lx zU=~aYAJ%J%7Y3G+yG%y!5Alik9OIG}z`__fnC08$=@d9@G6m#D;fz3_`HzAVBzOdn zt_!P*1+i*&ATIM}3)))gkJt>~+yWyy&)8Dodu}Dx8;I-8)2G%$fD6SN8f2V<1%9B5 z03`yF4HS^-MUmuvn>)=o=<1~|P%AGU;`?kx{=H)=y}#E>0<>O|1B~V!6#3Up&p<{X z2?8w85#h~D*eD5_0Rq!{$dLDy@ZLqjG$1_3^)aJ`k*5t8txQ5Al6+q=$yzjL>IRzy z?8BEvkg+V|>K>5!5H7n`^bFT`C!;)Ch?H>|;cN4c2flfYl~9=L+jh6Oh?p8Pf@@0Vfm*X059-Ibuk2DTVa6 zhTvz>iC-&PQ<7b6-DGf|slYAku1I7*a(Igi4Bdd_ivz|ezH+W~rcjgwd%S4Z#p3K2 zfb&JR)Ub-pe>n|$1~qz3*#7FmuXw!p=79Zam3Lw*-&6KR>K7BCPTkt2GXp)Q#Qw4N zizbuq`y{fbD!Uak4BSdgFTBTp3a$4V$=wR|3;y@1NuPfkTA$1*oc(>Z#Z!rPGq$Yh zsp(7O8(H#yfoJC6O?Zcf3J9DeeQEfYS1a%MQk=|elH8!vBZeb6I2ylT!S^9cXd1A) zMM~e)NC%WUCV+qdBePR8p|Y=IwVR0GjB-mU>BxhJN(8CS=`9_#KA(Z$u@89O6C;&L zVFQWG)I#W87xN3KhDLILYuS}K(vSe(E}9vkLJ82=V_3b&3dZr^3!=G3vmK@kzsRG4 zW%;>g_WbG&18>Q*HXGfx`+SWY`Gg<8(&ko-`y7ehn@udwRu*L)b<%h1-^n zinlX`b%+vd>w9R*^eS+-*g+BxRqUz~odMBp0a46K%*e(6`n&edszax|eCDD^@YTr? 
zr~2KwN%}zO>keCHWuhQlf5rCc9rHsYu+Ao%CZ(d-xgAFi7u0NPOQoK7Uy)e!X~nYv z_Bq(I0w44%rFzHI^3;8It$Iq9;PL9h(}8Pg2J&7n8z0!U?X--jHqPm`bHg z6wPO>{%}`a_%>_KJvs5zl@`~*(2GM~9Kkx6Kc=ga4@)J$1ErbHr@VA>i3IpM&}zh2 z;Gm2Qsu`|C43g=VVc}$x=c=-40FJ( zbz=v;U?9(OdOP_I&k6V3&l#uT<95TYX5ZYxyv0~3(})XhSkT+=!Dml~D1CRcbrr06 z{YLk8kfYg;$Kx$9+WfxZJHAteX@mYQO?lc$Zta1uyQFKMv!ZOv3);pb)rz{ij6^&u zKdCiNZ_VGa>2A#Vm^8ZoeQJx&G`xUTRi>jHvr(z&nOy>0C8O~(`DO!Z#~f2W5-mmp ztCAW~iHqa;l6+Tjzda_(lJb@fl5;Yj5rj9^YZczp9aBF1cn5O z);om`mRV$aZ4^z7r=JT#}Phju($_XkZ1P zmLp>u$lGZqdiIkCS?ik(2bU?^wD#sI{{W^=;tqzmvtL*!5kbYOsL2s3M{YFDb<0`5 z0!0EAjX6AqBP9V*&kuYFaUrCctABtW`(~Kr>8dGd$O41~NkvO0qCi$T;s8!q7 zxC#cOtz+#aVGP`X_+n;p;z`_ehLv0iz!k-d#Pe0~6$7A#J~PQ>upToFbU`7^FAAZO z773Ih79y+o$bCO4!ti=HD#?9PC;J+uH+IgmTyBhGF?*n<@nA8nvFcRsmo2}s@bWs|^tJa&Lw{uE zidQG!sOky~{!DiGJ$p)x!346xxrvA$i0|(Fsmd|W0%4ILm3W2aXsL1H1&eMav%pqK zsHg&Go>b*Y^^pXzj8o9{eEqT)q~kC-{J71w+1P$zxZbkO4xHH&G z3b@QRS+pgD&fW8ep}klB zX}ZW8;~IQ7mZkK$Qp*~_u;^A?qQ$?ru}EO|zb?gU<#Vv_q6lIG-rPbG^@9M$m<%%< zrKRvPJN6qmYfa{_ILfV0HQ#tl+IC?mxG-iKP5YR?Z@DLh&8^z>pI~THkVpbf${6Xc1n7sAY`w3R{C)5J&Cz&@RWb$qjz)x(I`9XQxRU)$K{;%Sm|$KEM-#a^G~%t#W4M-%R&mp$6PIXa zz`_cFEFOX51m_={_nOxJgM$Zz{C< z_T@Zo(muJQQ; zyS=}8#8oN}=m-|BZLlxxYR$iX?X2tf5hq715^bpwO0J4$`cD<;Uk3^DL692%G~|ex z9xeuFPG50Af|A)ImYO%c^cHUtiALiF$4`SgJ0OHTS!na9AQHY`_$%#M9Atxbm;uGfEO=d zaV43TOvVISZoirQTPyoE%AFE&99`6QM!$C|Vnr5WU`Y7k9elOfsZ#FF9Xp4b>nfi4 z>YFBvUj7Rz!(Bs0JGDgJEoDYD++v+mdUt4Zvr^e2Dsr>n{_?ARTM|SNoC)kxOlK); zsM2&w%NZX~*_3eE;Nj^ zh%g7yKy~Pb_5-$TNDKPIpHw1Ykb8}@pA1dAW*K9ztXGKxC%~IT*K``u7s!(0-i^=* z_epf{SW`R!Ya8ht^dbmSX~GM8(qJP%YtlDr$vO!F>9jtk95!_W;$O2d7<%%H3$AD4O(IBF5=MwLKf`}14td2<?!VDc3Ja)G!ZDF!EM9|AzuG(BqW zcpmWS5oQ^ECU+~!tk@%BXBfcY54h4KFyS)N0Ad;8MD#N?^`&$-64K)5bJ66ajfQk$y^7?4g6<*m;MY=ey$T_Zfk>#sA#fvIrz5MqHKf@e}+&F7=mmMuU9UA ztqQrB)X5EI>fQ?d2YA^sg zpYDq`0dmv=0JpcST^(igoDi89p})q>x39_z*YA7wc)W2V;N)DtbZ_X3K_j^obtOQC zJ0hg#j2W=T%}_4%u(A1$CDMZHrh!Ny97+%Aox6~-`O)p848SOcF^s`&$v(B}l?X`0zH7?H|R*kwpJ zAjPwK#G$G<=CSDgsm)AheSpRLMLCCfikUT9wcXQC;-F0m+ZT|I`5A*g^1^C(i%PBb{MKT*Yd%h-nJcn@!V>QlDP{xP=g*M8|efjcb%J7S*gF;d5oiJAjy zy)WyR{lxvdjH&ho90Yh`nY8QH&+OC3{UYG(&IS40yd5u9KszkE3qXC}+z(D4&wKm= z*HNRtVm;&x>%4vzoE_VI@O-E;e&}UcFptg8GjJVr3JoYeYe~24palhz!W2NejWcmEhcxCHnwirERQ3z%PwwCy%6131|8T9Sg!a(F|4rj>-)9Me<$CE|!gVEgrTBOpdK= zTEML>*@*G!;T-Jdy*<6rd1dd9$hJB;8Xr&lQQwBzG4#Tk6}o3eW8eEu+~SYP{L}+F)H6O&4h@2H@vh<>*&+4##cRb}c#ITtU1A3b?{olt+?-8M{pi z-}Q3y%`+KhW^*(H8t~DSxS4b`)MCjpO_;^*Td~beBA{!MNwKUgkT`cb;Yv{8(yc?a z5r9RBlaoLO7pyq*%`l=I=Unp0(vu?Lk(348Ky#YUDvv?w1Ayvo#GW)snTo~3JEf;4 z`);1~tqpCLVU;|3+?b!IjUBfCY};WVSAJc+hb%7K8)YvV95A1mOcgL=aHb0db5B z(*lc38BqK$9eo0yC)USH0a_965i|dps z^1YHMV2m-k(OVCZ$y*UG{ihaxjIZ$A-{ijG4>zt3h8GmaNu}3+G_jYnKPJrj(HZMv zw$~xvMZ2-(%$M}0x%j%&M|r6+_r)pMyFORcEZQ!avG}+QGHbwHzse5@$YUS$LFy?} zc@`j@G?a&F+A#J?x0cx8O)uZ6W^n-p;0Om6RevZB1q?xcmtrbGX8vNDh%Z?sjSPFK z>_W%ta5RYgP#|NQtj{kBOQM6&jT&i8=U%Drui^u!1OO~(&aAGAW$_IexNw09j7XA; zwHcS@hf2k@gF&Z!|q3hL8EmG@lO0(C-FJ6lblZ80Pv+UKF4P00vI-nvIT?KGPzd zXJ%Rig6FqMf*L2dDVX1fU_kOwC7!f8RUkdfMBaySuL$Y@yH^2K>VPE#SN%zOJaw(4 z=Q0CAG+?RUMxy9dxRtw!O!$N3A$_Fe8Y!Ki2%&|mJ||St&vJ9Ga}?yNglev=LC zj|aeEWXtt-Ck~RRLY6)#OjRDjK=_I(kVweS0vmoe9O7%B{2Z|&gA1g6iI~uf8^U}f zgFK*bb6OQD*gOZI=Fk zdi}*XBYB30NP(0s*~_Ryq?8fglko=ZMoxnb^1fViT5LCz{GzfI|1uColO`?D!lRSM zgv?ER78rgD`1#UK*^pGswsVeRsz6lExY3&EryhvpVr_A01vX}*uC@x?tr5t#i(IW4S&ypq6lX`zO1NnwVk>QqI1-U1WLR`0bl(de>hZ7tW=9=ux3-=vpm}2 z04c?CfXP}Eas&`w;A|PyMs_b74gxG8Cz**r_&%Z&swQN zqXqz<+5umM?UK1sSDZKa z*17!tWigVZj*%@nJf^Di*^`r9zX(6ovn+}b`hp#nSy4kbPrfJ)J#6}FtMFEg{Yv&$ z4bL)tyBxWbH3))(4W;d+Ndeg|G)FQlvA0Fl_x|t(TL}r3)Kt9gzyl!eoIDo@eQmH8 
zq$D2*CIz1IkWM6QB3X?Fh+pn^p~t3P^{Nj;9kX#WX>fI3G9XN+sVF#X|pvTDTnb}{3xoejrj)KRyo7+l1{6ozTmY1ilfB%6?E{`YWCdjvrlJNfaq!0MS?|a!=9;+ z-sNVnQGi=f!o(GoD+Et{4R{ca^>qdklY>V+6_#XhriH(ixp;j62O6) z6qa<5r5_UUKnKBY84H!=Kss_TP&rP#?lZR|&pJsKX3OHIU9=TFyKl22CDefLQ2_@B zo?p|egr4+nH1E|cawHExSsN>9M`t)diwDGe6f`e6X?TMv`mN&Y4VxC(jEeCh_73_( zt$JbkzaQy#qQ;YT43BN#8sZ!OPGph6c>I-plc;&|ujBPk zYo|peLg~^*&%2~d`IoCvW5Dm-CUmn3t z70+=Q0pja9X_Pk~m;qEeo(;6J?N@2g(IlOF9s3j3S-kN)Xc5G%h*`7Fqc@2{h6$B$ zy3j0*08;TZ2*Bw;EJH|~t;Y=BL;~SMC|MSotyfL8#0)0!@26PT02Pomq835SXw?y( z&cTh`4M;<-reS67Irom>PQyo?Aw%vuE*gOv24N_qyfzL{R=n}Tb-+YKysv=Ma<2U- zFYUmS_{cVU-0XQN6#`9v)ZB z{#kxYSc&l=Y-o4Fm);_&XXjo{T$*Dsy2nh87a6yXPa`UiBi!D6?Os3~#-GaOHguo( zF;;>vYrR>ic1IeH8q2}wC$(<^``2yy@7g;`UsYe3o?+>3m&r?s`t#(R>NU`5Kikgk zeP&TM)rz3n4j8Qf9; zS;$L6FweTv_ydZOf6>*o6 zRW9%Lh`fiik=_Q)v_olz1M=3X29vCv@k7JuDOu8(2m@5Mg~nr6vtPzfG59OU)aaU$Lu7P3nw+!zrDVj zQ}3JDdRIbds_<^IHQs$rp2#0pS3&M2zQ6AlH1D;Ko6C@FdHBZkSJKB_^ZN#Yk2u%x z+#JoUurSPI^hsiyu^5(p3DMeiX~OB@?m{pXk`7}m2)?SEam^?iAo^Tp> zS`Pc|XYGLnkkvWos?P~|o+BwaAYtK^xZ*siGPl7zco?`O7>(7<+<0B%W)wJ87YF=V zve)d2BcmM9*e5}eAO|_^GP>q+<`Z^0hG~`^!~ST)k!!xv*Uy3PVzTz07jQy%%k-Y) z&Iu^55=5vpLIQihQpFKq<{Gda0F2(ED#N2er_URG?)q)-$F`+wIrW^{8XX5<0?iqk z>vt|{^$(?&zP=-y8w-kTa>-ok#FgD*v;^R@BH_c(p%;q4x`5S-*pl580~r26cEv;1 zN!foHQ@X%{Mq)5Dp}MOIo@pa20g*C`U9FAD3PZb4ub==RvjzvVtP6O?r6Wt=1*LOpgq&5*p`Wk|a?lV=wt>h9bee!b|% zRc;GBd`9@%X4t0pkIK6?A%pzs_Lr~s9rZHKd6n~<_uhN-kjU}d4{qBn23!oxz5BTK z#cm^#*YlyLg71yNTHp$>QA}~b;%K{2$3PB-X6Cp=IL)&rhMO8_anry;M~ zK*eTO2I=Q4PM&iHPL{`ps5D3EweLeX!%SP6&;-Txs$i0{pAw<{1lfOS`tVt5Ra;;kR0}n=3V3ekIf{F) zMJ?LVjn!Q>(fT~=ldkcphxlxP41r2ss{?dwBuy?WO%~_?rBONijj=C5X(^!!Mk8Xj zhpkwBiUTs{B7z!@%^^5RTd)TEf;9}tyMsIpmRK7s!3=UAr~==V27yp+5+e0gC9Dsb zyk5bxu^BKZ7;=Wcvu?{*%Kqz9)REFMKSoMZ`OksJ3!ybEN;y|vcXa{J`!&q-I`Mkf zixyDUY~m9+3kPET`;I#6QNO!9@ON^W(WQEi0s*b-k9y-w6CU4b@nd#tn$rz|d@K-I zwFeN1yV)_71%#ObnQ#hFNX#qV0j3q8&pSI{6jp7l*%1FK(2-{x1sFM03K`Lkyi#E_ z(k9|`GbzGXhe+sFHM2J>8_2XJYT{XZsmKTOV2S-1n{Duel<&}ohqH}bf zkqi^qpFH9b)vY!w4PZ}yrPTbwf*Q-5EaW@-9`_vNr>Hjks-;+KBT|Y1iU0d;jwUJr z<(yDa57Pbnfr?mQO=X${zN;#veLhW6S_b7}ruvzM2Ih-{NnYlWCVX#Om3Uz{2T!#(&+5~!XwkS6 zO0+;Z`2+4hN}OW0GLa@qWw;d6!VTvbkw(CEV#TbYI3S;{Z8!u;;8=7FfC5@YjzaLV z`u$Yfp;baJ9x0F}8p9(d2h$&AL<5VYUN)v*nq~SlEdkf*+AJE@wZZmvKrrX@KtT;s zMHVj<5D#a)!=TT?E*f$J7r$|pf)dz9LyCqgbHA{i7ZIT_BknYK@%MZE+Q{|6N-E}q zrlky`m?XPPZuC8WJWJjy@&p&wnYrf7EMPc(W5X3(VU7UD#>{6BECCmXwM+KrzL+S1 z9p!7CQUkx1wT_@H@3%;!4ct}seu}WU9PuNJH{KX)@%z$)N3FNiB@WGO%74oA^#85> z@#o%vXoJyl#4lr$tm_;H|Mb+-@BcnlXWQ>59ZH;dCD%duWff|a{Gtp!aHnyw_`Gy7 zgSgl9K|yKVsS+7PN0LhtDk923bCWs84as&BG8f)*rP3+h^@>1r#ZM6cpzb{E*uQca zIlvlHFiz~Mq>CXcfgcvA1bz}w(OR)e=mF+h*Is1L6h{G7%Xb7p?dHiKD$V85G#4N@ zEneU%%Ge4fqwRxc1cb#z`lN`(2oer5?lYGdVg_kqkHc^q$POz@bd5(~6M_rW(5&=< zE6Zoh_K`0?oSgz4m5d$q-gan;8*tIR*V(AR@qD292~mM-WphXs8yhi@w!L)pL17xu z4rf4#?%Y$vAX_1$al%&Tp8dS~z4-fJ@~t~{1p7=jJjydO~Uwzht0TG)3eWC zUm2_rFIot-d0Hxj?}i^doj-nMcO2y(s9x1#GZKqby;Xr{pOCbD@fvyhOm8SbYm&QJ zaT5@G)6WSFcA!!PvqjfztRx%@g(%amJ<*uyQnHdM(J#|z29$pT8O|W1f}i8Y;#sC% zE+c^FQ}C9709uliw?uv=6u4^uZn$0*PV&n-2QKx%H-r9)XJaVhOYsoA)b*9!fBo$B z%2}m!j^-{rD12vGNf6_GWX%)0AqAcpxEf^5jWIA*64jwc%n)+5F?AuFR%bXL$QH2= zoXYzI3OCgT+67;yF7HszV4}Z)r*N6JVwSF z>$P`5z7=;y4?m~IDE~gRJ@TNomNiMF7Xng6qT|euPOgTy%?~t1+Ig9 zdFiNpsC{ujo;X~I<}c0W&$R0SVl&OL%vt4FCMc^5YE98}y7dDSrLJyooobuU%UTC2 zJp%@G3O3uNuL3{M&>1h)r11HV834H$Hg>TeP?d{GHu11-tV#2=+dP}3;B;Y{WAaG* z9tL$POr%HDLLrGavG!qTo8vXfr7ubRZ4Yv-nJ1S)?I!@ zY|+*d+4JOg;Z^=&OFq$pPNs_`q(c6PDNl!;5RIuXK@hD9eJ9Zi@_R~0T?lBk+3=q+ zJWtt%^1p{#O{iZg6T3FbX=Lkt7Y97%8ugVoD`Jt(&ag#>&J+zR1VBbI~`~>6~Vs0H}d3mRbleNEtK`Go5dnN&mAsV!;vq7Ux0` 
z!sFu0X@Z)K{VV1>;}_US=RJPff`%iFy(fk(;d~OIzEOi-7?vVJR{+htBw2gmr~R57u{bt8Y^fz=ePYDv z+0gG8aQ3K0JwPeIpco9JA~K{5GzWTHuP`VGdS+o2sR>5sLg`f=(cly~hn-HOWS49a z3=@`!>qr3OLP}V)0mFpyjtGr|?LLciF+1q};H0cJraZ$U4xgXt2M69u>!8&Z#*e%9EaygR zeH8Qhgekwk2jk|Iuv{z^0&S%^Ru?1U;5p4M_QD7pD+){r*jasr5pgrnd#$KWbPWOl zM% zM0kwI|Gd!^(K-PBM*7dM^+dG)Z28|$I1w$~|NTizOGHQS-;X9HA{x&AF)=~W;eYn? z=c9*HKxk0le|Bpd<{uJtKtx;nKi~cLqn4(K&VP>cXKTxZ7;D!U|CpeEn_+e$lI*W3 z0`~v!k^lXuC8G78um5|j=!ihqpcoNvaM*)ZA{vK+5@JL&{@t(ne_t*B`)Vt4)^Ei>;xt(EG)WBR9Z|-T3ZpNsQv%?=buh!hY+6! zTmcT-1M%&E!FRy^c>>9Ub_&4${ek}b1LK493kV{FghjT24O+HCd@wkij~^}|z|Rl1 zP62<1_;(2WrKn{lDCy#l*b{}+PS3wCq-5UMB}E$ktgI6dogpl;bC|PaB3|0-N zdq5AbZ(woI(#qP#mgsuO?XbItr&r*ypx_WPC6pQy8yBCDn3Q??OxD?R=V=!U3X3jX zzH*gOUcq8lR#n&3Hr;M+X}#0de)rMiC*3{0ef>|lBco&EFJ8WycsDaU_kMoi1MlPN zm$mhcubbb#Z~f^P41)isTj1aSbnO4uuN|OYeEj@ye#D=C!T93A|L`6B0*YFKf0?-; z{G%lIXr~Jy&GWA}b_px#kUmQVL=TJXRK~sAyZWbV|Mu+vS;sQ|KlSW?b?krlYXTC5 z!@$Xd?|@99xl2-)OxkNNQkp{EXaAPko6YtKH*!B-xsBpJ-y;^o-qn7!%evBb{#}J` zBJMy`r`qv{1}XP6Un0l4Z{^@cwASnY$~ph=YCoAFIj5XCzvcGqp)m6r#{+m&e%i-8 zgN_tDZu3Ywt~T0BWT8YtEF$QQock@^o%Q7wVZl$B(kOSP<6i=A)4QW5-aOY6>d#B* zP?s&Zduzm~!4aLtGZ){eN(_?KSD2RM-asBan#3V{VF#Svxn<5YA92o>+wfd^0Vs!l z3O}rtv2~wPTeM$ONf)9VAFCf^6^~69JLkhlFj@+~`D5hNhH|Eql+DgSXG@8l%cC!k zN|D8N4xN79zwh?MjcN0-rjJ^Of-|3oI>gBa#a{KMWrV%{#&Zw6r4}tXD`xjDEic(66|FobMq+6FHg9padz>km>ZIM+hS{09Tec* zTC!UyBbe{ks_K1@cP(}`#0FuDoiCFln@h2e)i(2eE$b(ZvA57GiK)?viwyyVaCK86~eu1^({8?0y59xlF=>W8*Yxr4}B6{ z$*J9Yf6uUFzg%XE`QCeadvr55k8Sh-GFC*rf7ZCmm+HCkZ$fpa^VGcWJlP{tH&?2= zL*M?;qA*QUdcHxuDggVMbo9~fd9|LXH0{r!$fhUh=Vn@-V*~{-mb~#Y-HaF7`vVQ* zFtra~&AYS|h8Z6ZB~Lc0sVgnHYfhagAYT|~6rJ%GzqYGHc(-}?PldzjZhp?6TZ)67 zS}-{m#NO(LX6UVLM%lJ@_Kap-3;L_$QTX?^L8H+df9p9*?AKLH{9c=4=ajYKR{Zk@ z$Db1~(AkvhE<$7J=dFsTx$`I7%k0A4UM9(I6pn`7Z~aT``enx2y$=<7 zq;pPv9y1pzoB4y454wI8rMf#jEQ*-dgHP3F$g#7}Eum|7Dt#l|2sWY%SowdgOsmUZ zUOcRnE0R#-24u-<_^zCLj_&sOJ=C7HkM@o4+I;Yhfyb0yFI<3ulf+{DUs$^5wwoP# zzIP{6%_RbTI$Bn5ABG#PbstVKPuz?+v5-${Xy(7L|InA@k)rs|YNzeL{JcLq-mGx? 
z(bWMe$5~t}Oo~5GxxW7=_C>s)dN8#|U6%WmvW)-HTwJ&Fz-%jBw;y@k)hJRx$wXNQQKKB{V3qAyG@AH7tOf7QW(@6*|-Yl;qS09{eH$FXTYH&p*` zJ-8;%-Y!Jr9?IRe>QzcZC)4Lny>12kLPw6ZbnWW%&pQUChTnWk<(2Jr2pPP7#Cp$i z_UsEO5_|t+G%WeFv@7?n7{ZFOTy!BA$_>lvMQsHqvmNYPv1pXuQx9>m2+E7-i+pk|{jXk2YAzUy1` zJa<)zd-tKZ^qbD80&WAt;Z>&QJneXZ9k%=6ely( zWT~7ohLK^>_qAx(<96+dnZ#ZY^WrP6g;n|cRLvC`9U5LhjefJ3$hk3baa;Z@?r&3F z`x^zMS!8E0A3YSlW-3^Z7f@@`SkbW&jyx1!6;~Z{64ULP+|)s#3N(10+&MRr*^m<2 z)IY!9Qu(B*7c>M{XM}4fJ70vLs*O1}XrPFHpoZhrT;P1f+!K#v-xAIGB$>%=P+#q; z+e<%qu{rEn*gV@OY}ftgR99u%FJrYESoLo6Z9Pe2Cw6Mss_hxW?jNJNJUj4PP4(tBM*-9KqAkn$oGq+@y_`fLz^+y3E8@}lve4=Grye^>q;!1#F0-=S{>M8 z;KD+FILg=`_@Rgpd6Pq=+*5dHk?=hxb% zhkChk$w7g=M!(ea?~P0t6Fs)nz6jqWHM&p_K+|r<5tg@Ka`vQg0K>h|H~a9%#Gt45 zUs`PLtnelYMse@nSWJ$+6LZE?(r23|w_`VOw`5C?cIicjI+w011iL*?cE?w5??IekjFoC= zYpe6kPm%jy^oi=NIEy><^jcIivp)feH* znG+WuJi;EmXJ6`J9O`X%C|z5N@%(OEOdAxbw$$=7wCuir8PR95t};hhUd_GZ04L7I zw)S(MN?7MH*|r;L-uP~}XKIFF51v+rqk6*!{ST%$r7)(J>-v65C8g|o%wzMtMYlHg*9{7 ze=Uw9e$FSYy#3{0wpE^iCsdBJbB2}R0OGkVr$|Qa z{vDpk_pqK&+wiSf{udfICT};yt(9U|ZgDDGGv3?T_L;Pt-JdRUBr*Sh4$pIZE9cpD zE7kgN-WR|9KW~#CR8f_eYV!4N3@5|g-pHQ7BInfRqeleYUXH(O)tL3nt?O&b``(ju_($vePU(%h zYd>rz4swJ*xv4m&YgepJ(MEFnOdjEQYawvHTIIbSTilQP2I>|sKD#r|bDx+uyby7X|HVF` z@0Q;5!*H3}gRdLa8&ET8QABY|fy7E!soJi$#k3MDtaMreQq$7^{;KPf-5tg@#tg5XX;GohnYdT(b9kothLq@&@&ei!&tZ;1eBcgt^@Ms#wVZ@*303vVG>kZXc@LZZ zHj3lWpluhg2;Io93GA#6Y}H4_)MElBGIh(%gHG+c6Wi@&){FaG0m{PjrB52~QMzF$ zZn_=eY0sFBOVjrhwcnF8tJ8X+fA3Pa>OPD6t~4nj6{P4g!QK%ym8LoI=1%gVoNc(& zjmEET>l_DkliZ`CJfF_i=AtroxHpB9ez(C|*0N8^z_!4SF`@XuS}KF?pwBv<$oF`> zYASB^Q03pslLn(7J2O4Td>zKuP}&V2m+$XN46OPVEW;iR2=eo+m^5+qJ#xcacjT&e ze6qXY-Q>iR_eQrTLl_k-fwtZA1_tuASkzQJSFQ!^!Phe*^0YvY@KyE zWoPaA)_9jIt0dlwL28qy(7Bk`3|t#J>A0l#$H?d0GEBobDPik{6Qw5Q?t$ds!+Y)z z%zSc66uZyAQ#C|)gs1(6{ILC^@jLp=Gk2*m;SaHa)JEGZM;%(}n;oeKeMG|UU(iOV zXRmWSpZ47A4Xx}vc4<~d6)}E2fv@Lv)S7q;uIcOrW3xKrc+cOx9u+$-Yl};;9bTTG zdOg`<=_QA)-jaJm8Q&_N-?Ja}ou*dAS(~oFuQ?ul8*{K3nw0PG^@nnzmpWKh8YJ)UxJn5UVyQh)*ek&bkyI8S86Bql& znv32!+U)ZY8f3KEL?T^U@}QV>ZlqEfF{h#R3-I6>LZqA#=6Hzg$(8cLvf}!y?p^ZJ zuY^}zL*Utr?|bBRMCk5ry9sr6ZLhAhCa{);Wwk@2U|&j)I~`1HD0?pMh6wHCHhXSh zJ4NEe@4E*xlqXIS8#7?#T7@lw`S-%TjPJRdY}^j4QItEZIr?VL^HU*J#cd}^ttCw> zNh!k<%`Gpk?!8p1a?Rg};R+|j*BCnK9uCv*k`x^)ym#o#rEnW0ZUkPmYEt*;z0}bv zHdW7n-ThcpGTqDI>G_p6cSXm7S_o>vMLj2Y2BlAq>YPEaV-{R*wVye-@3Qgn49J{% zIIL33KDShdV_6*M7h7)WnHrihcKUC(W4hzS0AAJXbHbr2d{mu!Y^Nd zTYO{LKsY*nvB2J@9d5Oof7s}Q@}q!TzVkM}m)vPH@r8|jHDTpBEgdG|11+x34|X&> zQmcz@>VI&7mFWIF_InDY{&C>Wu*;&8htw(3s#4_MEaTd_16S@HW&>9$Qjr-!%7T+t0Z5cL>!OeQHpPYgz1eyY%Yu$@~8Y0Hi=$ztlU0%I+I> zMJRH3s|cHTb*DYQaZ124CL`9Yq)x{sMiYtAkX8M(oS-_o!Gi9z|H#WRo>x z?BJ;NrjT51qJ`~O4bILnOo|Y^8kof!3{^5n1;R+!YKrDGI4C)y;@b_BHKnCltc{sQ z>LVNO26Sk4jxkdz;E9D35KlTt?*$JV5MsJWu{5hb;vu?4zSo4a^b6s9xnS+U$Zw#3L4dI+C* z-$lU?71`R8b~;vt*Ou{sqmkaToa}5GR~afcX(yD$9ze}7ONy6rIK$8aTTW(N5Jg_{ z!l=PHs;T9#-Ni^J9WzK|tuE0~NZw@x?xtsGG+0+E%A*yJd1$Tkt^+B^t;rX6N})QC z4Gd>Dt7=zwS7J+(msJWqII3-_+&(iSanqXGy|qh}Jbc5YbF)OyIabLYwUdpCTAjy% z)@@r?iaE=e6P97rSF(*cA^C;Jp#iJFd@bUsJU2Y9kck#Sh6cJA{vh4o>S;9Ot-7u} zrK(jqRgxvGPjX9`%t^I@+30FNwl5}3ggjRb;MbOWNoRSPlW;t?YjXF*5ZgSFyjaiO zAa$nYWp+JH@?$VOaX-Flj;9H^w*{9Wc+Fs1Ym!T6e>5t0!ZHJKntiv4TUN6Okuu4T zm;qH8U!gBe%DRQS`D^8XxIJr|)^4vLjbk{+eYM+K#(61x@%$1b5Y#g6TTW=A$wYWw>n0V)_3Vd-zq*%$E7H)&qxtZd5 zo!wZj{-9EE>;d-Hnnsn538qPPBe+CO<*G!q8%9N4g5Eg!1DaMDRx4@RWiODcS_9<+ znn1&L+B^M=U}_AkYRR62OX8oMcp_haWBgrX4Um#j%cR*Vq^VgVK^s42$h45efWN zGR(OL8LGD$b3B`}X;EGk+Cenz#7gocjNY|PV_6xL^{Xn4ytdp_mX?3%YVY@RKphW+ z!$mNM%HZ+|uHo&!dYh0iu4hh_5yzJqBzg+$pdnyM9QN-{I&33CD_=4tAYl4bl0h61 
z?~rA^D!sm>tiLG+n?9w6%K$|r*vO`pL~PD5J60UG(7H0BZfe!KnhVH^vxME!t5|5! zpmtNczcpHS5yjj<_Y=1q=cPjMyc{x-U0hmlms4Vc0cy?WiT0Ko(+WF^yC&KToD;`1 z&GMsdUqMn}s$qr=ccVj+J98W;0B-e(ec`L5UV)hUQucS)W);2DM&Ek?RmnAXR~rXP zV(6DfV4hwG^{LZLX<5KPN)AnD?Am&a#l4m8B;MUkRts8pI()&u)v?c|RA^Zn4l(Ui z;?RhK3~@~{W~cc@9kWw#A@WD2XfQ8fPv4#@SlM>~+1o|Hrk3u+;B>8xFnNg3O&DeA z$f;~1jNE{$m2S1tN2fs&h6oh(;*!;j<8_S$Td0~G?I)!}u*0Phx90V&?P}$kPbmk; zDro>o(RAk($EP|am_!2 z?NDX$oKh>}B$_S-o$9`nf>2MjW=}BP^0yTjxo(uMI~$EMC>b=O`4xEPu)|9J{{ULG zZv(l`IHJvl#hEX@ZfYMinKx9%1wFZ|5W;|b%TB@~idhLgs@!ortMgMHBvbpbQpC=q zr8Iz=D1>Jsta;2jRB6cOq*eSV&<2(#KJ^>M8=91Z(tt1srLYx+xp_D}DNK7?7^@(V zXgE11tviT-s*G_?*cD3!%}xTcPf@)F3(iN6)iu!3kQ;kfrQLj6HQ%xMV29HP1_Ku@YB|cd2O&GYUwaDK-(! z3o}M=IITGX&b)o$SrVfphj&_ri#ja}#kz)#A@W~(H&kwhyGPdk(F8X$HW|lHO7mSM zp%Fc<5h`p6kt|TU5biJQ_ysavvVV)YDe&}^{&Rwz=&En7CU_Z03CTx zgS=R}EOQB>6D`5X&N8*!TkGH2)`id*7_kwic_Org`BjOQtg)==P_oG^SB66m%1ZY+ zu730TPQejjm`ff(Ijh%FT-)2qv9Hcl^{fjCZcJq+$!~hqK5fwrSiNy~HS(!-+DB1U zz`+9stpzAmhCYUrsgRR^K9xvAP;S6HP(}^~DNIn@(-5PBQi*llM!?PnD$Fdg%o)cE+M{Bw?0RN|#L7yBMQGpansv3DaLwkD zPu8zn#RMv4=yoh1i+f9uN3CaH_>E#Noqg+_wZ4_tX_Vk}=Ckz)3tV;0HK@_0 z9;I=0bT*T;9OAn>4Qc+&TW$y4t`A3&C?+yY;a6@stM`5_(=4+Us zwfh#xO{AL8j{6UZ0ncjkIDA*AUurQTY>7*D=Dj;ezkz^DxlyG9B#tR5-GjNSCHqIW z%!z{bBBxbJKybw8wP$L&;#ga(@t~1h?G)IjnWTzTE9L(H7Ao)93l~~cvWJs)!R=Xa zPiLm9u34Fn%t);rX8QFWCA26(`|8z3X(MS?IU`>%jMX=$qa8EG@dt`A(>&J?syn^o zmRuU{C1}hGg}@wFA+ARn&P;4J`Ug=de(b^ zQr_KUWnSjDTf@1u^FZg)mrl}J+UGJMI2i)A1kWMZ&H(=Crjv`*3K}(n8A3Ac+uZl9 z`>53r#JDVb`cvTYk_C~*J?aVBCuZBmI(yKXPT+F$>CH96bC3_=O}^8Bi2`ul@mKF< z`zvhD=RJpNq}MV?!ZrKmskWFEZLA&zCpc5o)1K1SFwBES%neC2%<{ zH%@s@EmqX>SNfK-wx))lpakz&}c-QY%D&=5;*+;w2&gMtfHy zK7o903(Fe-J*(C3wKVxmm?YIH^#uUs7isTVCmrkum^F}%G7rouF&hKW*P!{DM&i#J z25L_;O;#Vej|aUw!?+w8+RFF}gp*PYKIkx06Lald$ka60gFzs_OAwZrXBcT+`VBQLX}>Y6raZ zs&654)|geg;-^xqPV*MYG{m1_^`R}Meu{8vM6z~#9ti7B%wij-ZU+?_tj^c~^H-9{ zFHxG%g4<}u%4s1OX=LojJXN8l723z2YS>F$H#IWGg|?p5?1JWeR!fd5c;R?*p?EcW z%M15CK+hE2+*5`~_Mk+uq-XCOQ(J@b zoQjJ(jw)TrE5_id>0O_Pb-yA+ns>m5EIrL)S!n)s$CB=` z^TR(C(cEbEY$KX7R4Sdwo#{GvL36B!#TFMx|GbKEkR*lE28y#yx8$dwJuTmu^$B#dIxjk|&jI5q*`7Vny#2UWdOA;|aha9n| za6Yt=hQ~DiC>Y5TaH^8PNaC*%P_q!&0)R8gk`wZhC2WkMTCRk&& zR{J8ea4SKg<22D39O9E0+Y1f12;PS^*4-CNNZvDm!wrhW)72hO9N|Y=>P@5<7Gfym z#~fye+WLW?Hlsb=jCPKr%T7TRqi=OA5yni-PjsY<+h-6X>BK! z=55N?N#TpdYk_W%E>BiIm2EWG%pOW8+I#v{n;j`^-P#uz`g21`tpsUa!Mve}0k`ja zilCZdg6*B3D%{qJNX+XJ^Yo=9 zrJSYMjeF46gSk<(m$im!hi%+g2XKP%T2X>F!kRLK43r41&)#~1nE-7Zx zH0$WjQC3e;UVm+@-$lSAin^Mfr6ZS2sP?FHZ=mdIY919#VI&~tvS#po$}aSNM{iot zit7GQtnd2Pg5JDK^S+_cX; zWf=gIwL)QCNIV?AxiMk|=KN%YN=cI;~T1zJff>q-Ai@ z6PlX&#E?c%ayr)Mh^{R)3(Q9f#gV%TI#(4v+oPDbO1bx|&?LGAII(V)IZjIH=@hj|k0$rXo)?vRnNbrN zrj^_?wtyPoPk?yENeYLJ&68yH8!*P=KFxAY;;@X%!9$=9{aS3Q}HQt;)%nA*x?oy;(#cB>kFz2>{8+uY9V z&@7np&qION6``bQ32&@yk#j+Q?>!11y85a49COakXNw0ZGGtG`?vc#A?B^cC|9XHy=Y) z;Nnb5? 
z@y>gickUaNk1dLj0qKfvaf)94f`Ub(wI*{wJ*WY5pFv1KCp3L&q51ZpVC%Z05#efL zbDC>l4A25c0eo@AIKo41BZ`*N@JE5L5D!CJCgR5Fe&)$>=}S(dxn9prhFFf=ZX=4* zwX&ID2+PyGEro{GA3hVQYTA5BM<}IjG5svkxZFcV=GN~)rQ&N3}p;eH@CP!atd^cY)50lMUV2dGc$JVR)s9)qv zpL(W~0!d?2m&Pk4q1|%K!{#2Ht6mufe7Q%;-N$Nm#1Z+6ENXK904bq9j0{$eQu`3$ zO)U1avIG+C_Z@1}?1JUiZU8)UQrmr+dCH`U%D;ODr46L@5njp(Eq-ZRfVLTQ~@*G`_ziOhDFb< zI{?Vt;~EtTm~~+#_4eMlqh1t)e5{zm2mp%qHb z{!1yh#gN0erwJvv+0-wG-KIWDvPd}twRCz^EpudGxx%(S@%5$f-RwH1o^BaJ%)cwI ztyzo6itEZrK-_+4o3(1}z0w)Y>e5SUB?$;+#?_NNt#S=#DLc64y&uFHWWQi}V8e6B zu0eGB#*LZKsb4`^sOlFo1)Br&ny`r#yk(9FsO~N9gv`5%1B#m4PH7wWg}#*1Y@%&8 zadVQ#7^`zz#=pKDMN(@Qw`{?mdbJ3Yp;tNfr(#F>*4tMn1l5?84>_qN)7DKsIZ`q5 z^&+z8isN_g+^C~e2BO4_ygwAzfiMAV)*Hij2riPG`c;ctw-E-HCG*~u`+=z}%nB$U zFSR~cBa$##vsL1`5-xUwQwe&IDc?W{;g6#ZM@oCXIPFmzk-KS6#-#h}oxD>_Lri&n z#+y5+j~(gK$W|A0`IvUBvntIV{rHnLGIk3NU_*s-$)zSmkwf?CO%f_knOi5VD7aZK z7r4(AH$@t$9BzXnCZ;Hgy*Q|&WiPaFY126Sf~=DpQkk>m+owJ8RMSzK)DlL#C9g>AiRCCY|iEW^(5C&G&CR+61ru?g>Y*xU(x>D>j^*t zHUpmZ*{!5DntM(>w;RJobJQBb)-IOdE#t8W9kzfts&?oj4^On4OtWS4n-P)Ht<2ti z@_8iVq4%qCL~cRBR`uqV!6aQq81^YvCw5n#!jR01$RT}MZ7iUWN+aG$IWm49g;5t~ z$%{II!?RaJ)7xk|h`&hQ;^owNa`sI9@6C9u4SFIj?e8H~@@}Kt>QCHk{ol(KGSO%> zPh~&$l;Q~4nnoXWdSbRT#j~_%V4dz3IZ4YZXKgz{mcvfdMb)s`arYcJ{XwjOs@&=~ z#%U!-RAQk}U~TPLscWH4Xm7f#vM_}h^YY+!s`4vD*kvJytt=P#h%^jQ{{V{ztzQv| z?EcXrnOyUeTd7>nR8m`uoRAOR;DJ!euPmQ49p067ZlXc*#(?9WTCXM0XWol~ywRX; zIy7Wm?(P(J27xk+r_Hn0ppxq&p*iNGRxideLIU#<5NCzR2akGvq%+*3IAeq(jQZ6r zj8h=~6?V!;i{=CgKZb?|+Qk}#hC{wt8)*sJu=PuCvfVU0aq2Tqw~^+$jZ#i>7_sA; z*VN#*c?!+<*XAU3=A}DsJAo)|Ki-G#iz0RB#1kMAESN<15B#4We};BO?Nb<+$Kg%RN8swz9NjnUHP>26DaX zh!s{4g|We-j8j+>(vP}2(g5B?1-E@DR4%)QF-edp0YU|<7SpUw8IvQps{7ZPYREk^ zK#q~Mrno*;s%!_0RhQJov?_<@o4E9>IPc-Mw*(+EFG`@gv}N1PMrZalH|jH^ywyaf z4*NhAQp&~e46~`+gQwQH%~w%HbukR&b6OUDB70juCKM-f_4J|2%q&^Evb?jIOus61 zz%>ic6w>)oaz|5Cr@FnnjgjVJr`EH~H%lkUE$b?&ONi zP$uTb%-N_HS;7s`;fG);H=rJgqccq!88|Ib3~I`h0UwzZf7!m zpoQ)3YYyJ``y|O42HxFksnlXym>|4I!x8fg_pUQtTZt}7LIax9NygwuABg3a`QW?T z9BkXO-1}EXXH|^(H!7`+D~7K-lHOAw^Kx6(hgpHx9fJC1vT3}#6-I|cgClfIxdk>+to%oJ}t0apwb z@sFNveE_OAQ7o~vYFi93ns3NO8Fd61Y<#)u4LSu`S2)OOsG2JoK$2qUrz4JQMWe!O zUNO@Yn$QuGd1r=bo0V>Um5D91Xu|_1y>*1EHW+d{)-}vZWh}pR)^U24`-YWsEwqEm zEWX3Nbzj*tYIhOd0#S(i9M>^tz6*qsJO&*zT`jJ$b!yq4<{2FNR?5qv2CB?f6t=lS zWDSNsqLJmbj#7dnw;1VEr6x;%}40T^W)qi*zEML7exhWSGyZO {3a!$*D|>{ww@IVu)yiIJ^F~XcE$dwd zi0E|4QCM&1BfU&~%6k{X9+VuKIn4y)nxP4b8L+(#87H=AB$0{$g`{4>s!0A@{HvVR zsgRYZS2!7<3d`m`dsH$;%Gqve^L_5TR1iuf*bY>Bny}i1wdJ_BWOa-Z_|E{=Rj!kN6lU6JSdH6&$9k=GVS4kh2+*Rr z8*|9^G&r;?9UYZ~vfg{>c2w3bEcX1ch?D>@2g);9_ZlqMwu0nH zww5rsLS;Y1+;QvDyIp#~NqIfx#6UwBR#Ln)t^L~Nd_{MEsoCk)hT1!LBDxM@L*E?! 
zL)M+Vfin+VPqHoK0(jf(Fvl23{$f?WV%hetDlJk-^&Qs4h26$`itVl}rhQ*gh86R7 z+9Mo#4a4%sdh=dlbI;7Jjw=bpZfKp)PPADP2--!#Wd^n1P?PNO+r=iu3ywOP+y*H1W)Kb52(9~45x!9R|WFhxFX9buq1M{jaB#pOpvTuw4 z%ugpAel>JOZv~tQ8_6Q51UWvH42f@XaU+8!Fk1$Q>$y@^C`+B;o2fN&INn$SZz?+u z%zbLL&AX_>?g5vt_*7!*MKPy8DEq>xkZRe9ZG8BqI~AiU%6pG$ulA21d6w>1n#`M8 z4$Bnst0--$&p%4Ih^6p;>WsB3>ODwhmCo!SJC8L@?@*Rar{&`~s}jhuE=lX@S&=wR z#d*t?#Zazj>DO}D-bZx_Dso5QMk$vuDMz@64H$i`j?8;jIeE3FD+y?lKfDWf?OjKO zG&Y1>!!_&$b_tziZJ@WHJ;nSwA4Im*QZz>L%Aw4=Yns)xNpzM{(yYk;0LGmusNHA= zNdR%qYBzF`otaC?sgh$lxaHF{8mHj#6!=IKPM9yvyz0 zT*6ESBeAZBP1gm?(D~N{GXDSvYFy23OGinnwj~Tk?DNv9CX*%H(MaWi&O)~s6%MC= za~#PuAxftt8pylxUG>b&!GeyQ3|6vsNs7m!SxAZgg8LiH;YNAQa~k%Gc-IYaYbkw) zYXMx$`sA*EK9`pcQg9`i|9| zjCVCWOb=Sl!$iJdw|2k|hl;3WSADoRt8EZ;-JU8iiyWCi4G94{>I|Aw*!dB%Gl>m|VLp{D1+wAx9Nr$6JvgnmPI2K3d9w8sg~_X)B&< zcSq17ywu#>B+Fss5{`re=5dARuX?D*xQ%az=j|RMnps1+iUwpryRdRTW1g7xHGjrh zO{`jcQ;4=-%KO=p7sF@x3F+&}uYuc$Bb-;X~;f?iIP~5EB^o&O6o6sE2>GXs4D|YdvgB( z>W&gOJDryuurtMD>l$tCl#xSv(JYLmytHl1pd4iVdwSG0rPz$<7?k->Pg++}*exes z!lDqE*yB9Yn-M54(E88=%c`#nO^#-gL-K83dX_caoQw|DA&!A% z^r%@mp;r{F7aTb2RWGf+(HT+0b~R{>u;!5CIHBq^bM2+coEc3;CXpj_dg(TU+Ly~( zIbs(%mCu2+%mP+VH0<9cqz zXSSf$vTWK^14{PXEpunf-NB@g?8X=Y+}26fRh$5=x7WRA++G%E!Eu)3nodE=ZCqSJ zAr2pM=a4HBOP{mdt6NF@S>l}VR)ujhLy{YZa2^lm2S{Z!1eXR)M+f(!j5^K@2vW)ZKyCCyDM=(cDJp7@Vl)v@T`Sb?sW-XuQ?)S$x13 z0D6;LErrBOG&b^lyPPgZy?5~4+S`vKPqS%+O|{QM)06qp76yV_*gj(FZNHrD{M-Eh6vWD2d!kMeM zjQ;=;YPS}UJ0r&=kvLpB=LDZh`&_K5Fmi`#mI3I>hV`m2krDpdnBcSlT<)yZ$ zzigU#RFB=fCm4MH01^J}dZmwz?Cv!QwD(77YEA)FQgEU^oK_`v4iaLkM(5jBdI@MO4VHxeF`vJ z!E5F$3Qm8CxivaDMvOly+3(V(XbFp8%AZ_Oe*#@xNbn7imi^Xx?H;vzh|pQXzyr5D z00&C0uKS-OUNU^fHoyn z*jRew)~_|ZOK#>!vI<}Z9qSG`t>BSWqQ}jULg$}aT{Igzd&@iN=eLd-64j%L9b-n? zTO)(@_V=wVUrv2POr8r^qcTjY4&ZkyWHHC(isA0Bq`y>_S>!h;cno^`kIVF~yGGY< zE-d3vEne#c`I5fUeb4(;)a=tG?r@hDb6d*sx^kx(9V(-*K9%XyDqKT}ptnZGNk+|a z+K!7LxP7bw_1t(BqfvUHBb;t&QF+BaNWw^0a3FUS)%jaF;-QRrRV&kxOl8O2sXWed zqL7^ZX$*So@_kyO@69Gp(wqRI)&Sd;BBFTyU!_b27MqdYn2IF{$pDIuLgxUnIIHIe zII8S`_M*UNwrbXya+cR=B%qaEPAQUsoKvEXHI13qC$&pMF5ha8b8yBo%Ms`^RQL5Y z7TTRQk%lw?&^K1Lk*zMD=EE#4A~^YQdNojt3~)I0G}c@ndK$P`(>H5=(puiO*j;{7 ze}=eg%SkQn9Z<6>w+-uClDvr~Gm%u}mke`{#;Ls;V2&Uz?RSBOSMF?w4XXF z0?X<<)kZxSn=I)s6}GDbOE$>C!;*!#J&O7sl?RG+U$N>A@gb7dZim-Dczzk;y7Q%J z8l(Bvm$Az6uI4Cl*SjBIZnbf2B$ZO%QvT1EV{cWD3((|cbNY|VnsZ7?32CX~g3;~f z*hbOYHGk}KCEO?!vp5AuTJ80p3?=3L&B{K|sM9J&n@h-gCEy-+_}ucl zkMRzK99KZLEh@<^x-i_iG4M`(b6obO=GN{jn3u?4c1l;juO0jGN^e^*+|kiz2HCC% z25ADWe4ubgU!_`TCcn31_SkJKmt(D+a>)*(RS&H{oDpZLSB& zBW_)wH{Jk`z*j>c(61~&)nF{J4YJ72$QL;0EAsxF)l0aFmp&+l));inAjx$sOcp8S zKAJ(_@I|~^unQZ)~kT!hS z9i&!_=NTuyI^(4`#9kuPCxPsm)h}4bwPr#w3Bcul+V`lJEN#_+!}qFMgmOtFTNf*y zYVZVg=ApV0GRjT1Oc;53dSFlGZT?dbVrLE z0DD!C0j4xrqy}I1WS)Aw4#meK73jVV(RCYpZTA>`$IZrbxOF-9tQ`s^dxu8{%##ts z+2bd!e+ue!+qHtqYs9(z#$?&IstC}IN1@Fc4V@Lu*tJ*l%s~iNX?pEb*?sD~Xpeg| zz&fT9IUjEdy?FIG&1Fq}_K7suBe;$`IbxlB$%Y#PXjAp%{VQ5cP}#$IHODqqiRUV) z1I$#*ZuJ>Ea45Krf5ula!8Dqelw@dvCh@=npf$3zb|U0-;JhO6o~Jd1E^emlw#bu3I6S6 z-NW}>ecqLS%FFF;Dj^G&D8SS;a?zEJc&Q?NjktB}M{2QmYGI6R zdiAcWLonY&(!2nyWGi#r^!~NQt7LsEq|ustD^t_gx zcK~{=Iujs4`{5{3dt(_tt#gq0mey&6ep?LurL*7Gx;PyX50>SJc0KD=B-xwLxUSog z`g7FNujQ26!s+Gw`-T>vEXFd&AxC`hGgjUp+B#!kF_DvuWxv9#k}DV`xtbWb4AHzZ#A!XhhF)=X}qN&+AF- z2!Rt=pLut;P2&Lb^3^Bs&D*(YZxH9$^IN-#{5>iE05hoE!)WBvRvp;=GfIxnNt)6* zw@D*!EO_S>%PZG%_VW-OdBN%`NiJYWlgpL2ftK7j&$UrEHqzWIx!E@FzuAYq3c$@% zDMTv7d3|Vw(cu$rB}pI-ohqs{Qb=Q!xd1mGrDzLxEXc8JjySht7~-AoB#Q|w(?IP0 zV@K0*=qsMqCzLvEQZ~O;;9|G!A!#As_Oyl9=0!a7?^xQDGF?VnW3+L$(m~^&dZ?rq zF{aezm6vhO&y;=@)c9_DJJ^27G<&3uUo}Yn=>zm1*13hUjaYn#BbvQoX>kL*(aj>G 
zjifK7Oq1OR?C!OlKGI#V+cc;K9l_g=@v1l4_L+NbQLXL{M^O0YqB?f7bEGV`c4kz@ zMt_NYN3S)_L8xD8mlBEWqhimOj(EzP58)n_X-eG42hL+NH1-|4n<5}`7wJsobDCR#^$UuYDEz~LQ75h{%5W(G1d+1mii;1so&{fzVSnC(x6+}R z0*{uR?gK_?judsI53qMMg_j3)07OsaNV09tS0vIF9<w>{~=VF;x^D(>tm z1fEG7Aws9AJ!xzMQYU9@=9W~JHo|l6YS?unrVrML%ADlq)KHokoR{p+u;&BPrj+g} zChQDUlDrUS%ZzS4Y8PQPHZI;f(gLjR#GQnbRazqFn$_|b8D41;05Ze_O4l9ic5bme zNwYh#^O|gi69OWu2mT}4qHBWK&OV}=(+qKnWQ+EAfdrHJRFR^V*tzG9Y9L*+gkn8F zsh4uDHDQ9Q_JVC(u9gx3aoseZuBdGX)RzRDw$+_ zZo;ohsPb&w03VGl)ORF^rIt;}`~|U!uC_Zy?Vy(RG;7ot{rFt|bp*3Ck+LS-o-*L# zue6|8$7~T(YnCiJx7MnjPSMVK(WSu_VbOK{4^@p!HnKm;S>uy%+InD)mEKx2-7CG+ z)2S!qNe|t}UOi25b`oncGcYnHv=S4**wVH1gXupfblaWq;CMIKq|ZZt?27 zw{eb@qjrkA14W=&;*7}axe42_WD$??rd>zo#_bd~!PqL72Mx#Y^V5&gsQ&QuyHqVunVBHC+q|@d0gd?s1E9~XQPiwk zP_&6{qiG*^%(1Xw>@)LYsN?Xh*%EfPoZKk#_HeNt{)C^xx!qFQ=F&yFwN(?ujgT{z z1m`2szZy+y7b@w?ccIBRdt{m!oj1WK<;H$eq<>y3qJT*HTvt$KC%RQ0N>0dEapxb0 z6<*3)^|UfyH23qd+yu>^EpgA~)7q`sT-i?xc}!%MISg_~xE-uNpVV0&Bl1H>%}7S z-EA7x+6!32TFSAuK63nV@9kWkp{L1YVp$NWkTG9yt+f8UP~G;}NX;+k20iKP8#wt^ zhKr0~)Rxcocge|8FxcylY6e&Gm=nAZYG~sv!h&&HlEEGAui9adZcp{8vyt?w_AtD+ zUP^HNkD(L+NunU+j8nO-Jzgm7cG?Y|Y+*#34td5&C)b{xu~}IIiUcpU9Vr{HsH6wh zfsQn1jM4_9JWv36>S;b+-Dz`?NrGqq?Zp84&}h#z024|0QP5HjC<_kV>p-MEDGdNA z0Cg0HBv1}}&~rcvI2457=94%Xrn6yliU13I{?y>y)JWn?;9!b)>&+wtybSu&NCtt; z7|k#fpvV+wr#PcD1DXKaO?OR!_Q9rtq9%=e*XB2FJi+wlpw+Hy{LLoP+fqqk35jHk z5F_5}!SyN^Ca@+A=7Eot^PftsaE^b~ikEg4I(JO^Q058cC(m zZexLrD(K?k;5I%{&nNWvt}z?TK;ZQmsq-$zjbvp*$s4#dscs^2g|p95K-Pt3$;Cq= zpW!6>)`P{4SCLRWeR!h7DJW7rw;USIytndX%KgLAwVpbQXpu@dsc3gGmAS#`Q;DT$ z#C~g-$M9}AAB8sN*`a?`Jq<*yfl$!PD#;Oz*OUA}^sei}_t8%vjw^WsGGzY%5|TYD zf=9xYKXlhi;pY=wMP}Es*~Y45Y2(kx0MBnz#TT$x>Mmw>lHbaF2@cnckGq4z8ql_e z>S#f7g_rLTMgi@QUIlS5$*7yqqH{XBl03uJLlK`|ab0GErM9e(E#ph}OH;VV7apI8 z2alSn>^V_o4A$l`HWbK(;*n1PD|P(FDig3Ql3bxH&z1{g-}0^d-6qdY()UiqY~z)F z&vTOJ=Uv+lI`rym7EiO@!~2(M5=Lc0RD-u4dljPYLpjMcIh-4aY@ETlw&3*(#(l*o5{uQNtr%5g7bWDOd^`}_r5ev=t4I-#@RE!LHopL`aQcmS2k)vV$ zn={HT*^yKUK<>w;Po;0ibhBq{j5hF9o1gx@BHHQj+s2l=0{LUj$MCj!#Z-bZ>#E}c>=+PvQ#ktt+U{Bq^ zscrO|o12K?OLh)Oz-|ZPdk=chwzq;yWtwFw%16wn`HG^xCBD-dezAS=aYf~ z$mWFZft_}Z=OkB)eIT}m1PL0)A!F2i)*hbTwJxbHl(AW+p(VRP_fOgr=U@OGPo_n1 zx|faaFYFd2(rw_FlCLyuNGep0pTb5)4tT5~9LLLY(n+%b<+&sAp*t^h4?|rKN1ITR z&OJT+nGt4Lkd2&=xt_bfO2gKC6+N1XEZolt>KlT-+3!}oKdB2{Kgxxtj6or1GdMfB z`-Ai671zbA`KA#RDC*mOZQ!cldVUpgs^7#T%Hp%Qj#O8PqXWCCz^UHC?We*=y^lhi zgI%T7rTv7a&c^sXjIZ-s+(ab#k0)vB27T%_vx3%FCO@-H2n`}ZmOwk?Q*l?&4jxsO z7bz+bM_h`5+&dgsq<>}U`kR}pX`+@UUByFiNBcSEv9A0UqDGS0h>Y|?az#E=_F!?0 ztPU}Xft0b$cF<^-E=9H4BgND?7-3VVhV3D>WRM1I=XpHRw?+p6j(w>LuobziX(rZj zCjro7GKNF#K$>j$po%@V2YvQD@$W@72~*%&G{LFWt^z{#C--m>ZX!u zL=!7`UI4K`PQU3bZfoW(D)dN1ldJjRx7PNH+%U5?2M(q>) zyO?8~)v^FJlxWzXqQy|9lc#Rhwdnq5Nr(q)Uy1yo@PKt|tcuJ^uTgCta-m82a2UqRZT_U}vsB7k#7-|{HQ0)P~I(lEg5NDmYOPB@?i9MT-) z(v+MW^rX+d033tsMF97t&PGK50Q%5Q4_wl9qmU>N)?D%q4LBT8!8{5~b3nk5MJR2f z7^ao!PR?mq1}+N|j2d=vkVQ!tPzlF18SB=8fH?+%$9h6P3Q(T(#3KXOGzBGvAw1NI zMlqaH0k07tp0!=!U`NZ=tsa01Xk&~Lc%(BtMCe%Ok4k3VyjFn#_NXLKjnCbn3MM|{ zkS1#00bVgo3zazPD6m|v$;K)tOjVH#&7YGWT7jGeQdN%bI25g6++=CAvy==m-?dP5 z+sNcri%Q9E&*ZiV3&`YD4IWhbQW_|fNteoT_|{F$yFnpbWO~-riUQWf)tU; z_x}K8vHt)9H8NkzWg|-gEdr3Tm0sBdule+@L&Caq>e|{{M{42PjdylYS9^34d<@XKs1xKgSMAF8X5lpwzPq0Tq>=&dwA6RhC#yd+~ajc94l=y zO$x^|$MP6t+`EoBEyvyMT=tc!Mq!EJaIAn4uz#yR=QYzsth31rWdPVnk<%pi&U@CW z+haY;0>karws$s(90K5&zj^&L_!C;uTwXoP z{hC?X7c#1=5EpJT7uZxYNePlT*MFJ=o3A&gn9Y&5-)ZabPA>b2nHTn!*3jF1o;93+ zJk!{2A2%KFPvulDt>wDX<+Zi>8V3>wErM`*d-bl8>MLTX@UczDh*^0<>z?Bxt9_o{ zC-c=`XUt>s8*x?V{{XL5Zs4S0=@8F*ZqGcV%y78lf&K5st|JKM5zUOL=o_EHoju$z 
GIT binary patch payload (base85-encoded binary file contents from this import; not human-readable) omitted.
zq32VF=KLs%!F8XC)GWZueB^jk$E&T)ZXFzwe>OMA3x4YyInD%jYYCP6iJ7&ckx~&x z3eQji_YSn-R)i#f1K5z);?|eQmsd@z^us_YWP4hW0V-AsGnV6)6@a{>79NhvS~KkRzcRCs7XORuWc~S zw*rysTIeDbDouBKQub+YDR_5-XH2LEynA0|uMP_;g`X&ebsRHol;#mn-;6?h5dQ9< z3CP0eAg)$Q3+x*uDA#@nDJi;@U{g}BzxpyV`p>*Gq@a+_=F2u4zl7h`@Fjp(r8lBt z=A==O!1u+LMXZL0!4uL2==Yn4k#?ebdCLa!oZoZ zGT^nRmw2iYCspUcChjQ*>{i-)FWoN*N~y8w(p_qu^>CY4?W|OE=^T{eMi+ z*J^DMQ(ZXR1No&5Yku5~otbwNj=qGYWnbg9VVVaC;?ezuY*V{tDJX=enkP@rsu}l7 zVMDBpWu}Zq+x~+>vK%#YL@TGH&w;$>uEFqa70QZmI|UM}@6zDNZkd+QO_uxk|4e57 zIsSUX=lUj9?QVP2+My996TWi$lh^XXrl|=e(C+{jEsl-FWB)9D9YDN9Qt?!f6LU`P zsi)Dcvp1AN1u}C+d*c<}(!C)Qrg2eSaTn-VHN1P|Z>#vG*W>8v>h~~%yZpL_95#s% zI8wghx=We@&3(;Thi@Su=YagQM=;1YMPeJYAgJtKzBW_@jX7sdH`Zhcy`1-%ApNJ< zKCut-Sc~`S?5CqvTKZYu@If(XMDq(}60gctFObxnm0%6wYw;b3jBsrLMFK`1=M)i2 zDdNvs<=hb551ALui>%#d%dV!M`w1Df-#45f02e?Pu6w0+m)efcsE_HZBe ztsEz+?}+_v)9SrhD)+lqYERB6G*F@vr0_gQ)3j_u9d5IE$khw4qya~$5jN@z8}i3$E` zM<9pt?<(M4rsyNZVl(o?!=PP5;XhycBF?N)irC=x3sYX8~+JNeu z7WNCQ4I^q<;bn&-#&E;l$Zy7Va&YqK!NH5}K2lYjJ5*jj6Q{PnKUSLqog=z%q2U|7(2jsZau$D1E| ztQ2g?Y;~q2-C8_s#s&H$uPq2ooX@H#+jHJAz2kaN#$Q(JzDBo4xVE#8Fu>wWx>Zc& z3d7TG!HjNgw)))d8MZD}{Zf=v z?OOGAbcrc}2$gva5WZGnUQ_2A=*P`3mkSV!(V{TMeyU2>FpnJ7kR8GbZup^=q~$Tp^!Lq*WP#xZrgKqcNCzOD8cL zRNLzB#+10npZ3Ci#RKSm-g4AQZ>O@{P+6DF$~L&H&l^#9)F_}`D`C$l=Al1F?c?Gp-^0OE=mqF2mkr0BCHLIl;|c~#(o zZA;)_@^Q!??{RwQdK?i0ZT(wv+Pr#(>%4D~-g(T{xHi9%FlunZ-&Mb&XfxUGG&Y3t zl0yKClIWkj6=|Gn!x|uz7F6{DMDYP8;6_RjAlb=Ju`K;WOY}deDXaf^fN5HspLFJ* zjQ@k`FK9EqFIy5DA{ae8y}Z>BK?xkDiE>%4Ao_FNe)J?V*na$GcM{bbo~mlOy4Lg} zE%G>0mPGH@5;`qjVs;F1Rh%OqC`a{rfBYB)pNQW~2+)7WM{C&%Ii9WEsI4l|^I$@E z4v{2#`d7gV!wDiB#{Q*2pmzJ^_<3Q>7vwf+AH0u%AjPW|rF0K)g=LDfT4TX@O(@-; zJ}NMBM5Z+jXIflT^~Yz_=29Jomwi#_&y&uXYKI6x^|5Hh<%o2dKt}Q8NdQlMBkh{p zC&BhO!E`B%ViM(>N&(eHcOj1h)m>>Py-#pv$QZRRix*MbC}G2C>d|XMtDFaGr|(-w zx?>n}sSn8|eq9wk>hT?`phMF>@3K>$(vjOug)Ij0wjjgiwQ1y2w?HV$q_+7js7z+l zxS;g6g4QO|tPJnl56O85H{8h7Uo<7YgHe}dx{G{$6S8sl|0pU^d1)dUa9xd{Q=v$lW#W3-QzYU)> z6+Ts^zl}QTsvrWKUQ`38l^oWrcmgEC`I=G93EINfR z6wol@C#uR>#y2;v6?tSGg8_xP?s;s`;@;VSlKzzT%AmbhX^gvO8orC;+Mp0dD?Sq3 z5_CSuF>=@{|JQ(UPrQ-)$twJtmI|CvyZJUd5P0c*W*gTScCm^bBknXsI6QO+b+OuY zX>~nel=^ZwrLvdFbsIn<0Ygg@x-YNR`bM&y(3Duz>C6F?C%F0QV@RQvw6 z9~7w_6xkPk^2Sl*pQe^T>mc@zX-$ z@KL+ju7^%&iVgEf_~$Nbr)9GtWFfpv+k`f~ziGDK4lKBbTF`hDO){e#uhku*L{iE= zDel2EcTG&pmcA+46QFFuD&Fu%Ic@ z_YlwN+~OWd;sr2&Y#Uek9po6ZYxQp7PhGh^1W;pmc}1BP2B>NoS~`R zCm25af5!sJsU2KO<{kdABa!AN*h%s;-jn>yU{If>KVTuaV6MzCmyK0sx=6soWF6K} zH!P^ORqNqq=3Ejtb$|-VcbDa}b1GKe+>07pZEgFC`tO0e&8$f)#^M&6f8X2%iFBx@ z>~=%)AunEjY{rX_#L zoAacvmWI8<4QGVw*Ep=8L#XESK{Txx$RCrnHpiF5PfZ>|;Em2IV$BNT7zDOG2R^hI zee|ZLEH15Fehm@J3OEVkDy$KEyR^5%XKpANBclqg`sedqRQWJZNbymVV{hegV$8-k z%PO^oKDPD<5k`T7A)_R*YMcFY%~GWg@+`izCh=@Q=C9EvP%WQKsthx6@-4Xa7VYp_=bsY$vlTeMP6;;B_X9(uN$M*4*!D@Y zA1aEG6JD3WNyA!wfjBpkRCjUM*e;ngClG zZ>6MOQe!q>hXXw-u`thKQ*OU3ht z#BZTwS6$|9qpw#k07kP#g&O>Q2pr_( ztmG{|=nuv}usp(ss$Kb!Me$rxMmhkM310mFNIJ`?HrTFN2QOYIv{1B!;!xbZxVyW% zySt>p9fG^NySux)yB8=tC+~Ou=4Vz`_PzJaT*E;{7Mnvq7XVK}FQySqW6(mV^2OZF zG9u1PS?sBrlp<)7|50jwP5HkD zD3L~LXbA2*> zp@Qiv(2BAW%|XTTeS8J|%~vP6W&U53?fQRq^*23|u5LyBuhwuM6zU|8P*2$dP`b3h z@2jgHmLTgKQB##OMEy5kfn0^L(`%4ApEt+w*Tz8aSI;ym^d$J;-BiU8ldJS=-*{qD)9^Zxw|KC#` z{Q-2v(6-asR{VkZ{qDT~0QzGc-qT-nmE^`tHxey{CX>vT*Yz{^z*0?veZTDqE-l;Q zwHF-cb0(E)Z&Wwm=b`YI`V}M|ayy;ZfDY%iQV)l%foI9$P>y>l5NVH{&LfbIcA>KJ zW6~CfLD{PW8|P$l1n_kR{|VQF|#%0sT+ z$7^-$#WcbuY1}L4=}M~JaS;r)jbcLY(al&S^4GbR^}EMmohlkhV~f=NAHJ^(1Q9?| z6E9H8isgofZA9xE3_m#NAlHCIZM zC(}3-xQZo)~Hvy3(VLOD}gwr(3UC#XR$u@uf3;^OakEPquGa;0tEY(%u? 
zQRkwKH3dSKe6Quaz`&LJ?MglkPoJ7kIoXESdXRe16>qK7`;=%vLFQ@(%jpOHEP1N< z;UKZ(xapz4^JYw?r|w4~d;YJr>VwGQ_}{fo{RUI)-e@E<&sD(#Ug_5L6*n6Q4UGul z={;hefOwYH$jg}93X-kgF7RgZOzte#La~)os-Aq@kMGut<2O}_zJ;B=*yXz!KD#E2d}%o`;A^` zhr$CjziqjZ^JvK``+hSVnyqClYK~6y8Wr>!A=8>E@iRx9Vs%8xt-x~`>5=GIlwb2hKH{+nj z=;h$iz1CVhkL7T>D9eQ;E?wgGC-*pY8>zES1rAb-)_>*vPm@1e=dVJ_lQfZ%|73mI zcYLbYMmn3fuDBT|oaH5jqYJ+_4j#FFr!9HIVZ@GSH8tT<67zt+*XN;QJ(V5IUx>1p z;*c@!JC>uI=mQ!gaq`tBL4Utxaf3e>tnT&$_zj<9mKxXyQf+tQ%$NpkBrqx8IFALA zoEguap1nfLa^j`k<(W*^dtO>l&$DFp7Y~*4z^a!mnW}hOG8Z=c)}E;1EUZ5S*Xs#l zDphZbN|LuQE#r;tffA)SU=OilG+$9`!BHxzKX);S0#f^aG!gnTa}9s1m5A;}?nLlR z6U98w)1Oc057l_b653mjxSP|_n4#T>(5<-ic=`7~vg2yb1@18E zVw=rB#H%|kSLJSh#Jdi}WT&!wRv3+cCg+%JhFfdcN%14Gszcb5gS^xFv{j4AQ4q;R z4=K{|XvBJ}wnVD!7dj_@!$wB4g#`CrSbMQM+e~$P<6euNcsi+>U~{CoT;ziv`OZ9w z)&u%0m7BzAkda8AuhqnU$NHlbNRy|c|9l~YHr7<$kvaaUb5Yk@?;hLNnF8sT%Y z2q>xwGQuh9Y+HEMAV8pFnXtLIAygi5e5al2qSW!!s>*2o8Db+rG-Tb;+$!xY0=S}t zc&RZ|7p_@-W!lI;JE`g)&Umc zMx3;d@2T^ctT12Ymoolo4oZVje&1tQ2^tJr=^X3@16<3l3L$Rg;)RSQ zLym%2A)}>G5Sk<*NW^y20_Gy;v1C=aQZn80xRR0C4nBH#zEL5z!{E?qahAD?TUD|W zYijqo%!VbK0-7vkBK_WoW>eY1#OCD9*XzKA^AM%#pq@?#h{vV|Z*#Cw%w{2oDW@B; zToD}8AaH#-*ei}Fn)qIz(Sw5)iWewl4ixycFVA$p6#JREC=gZx&n$!7yEjjxjTr)? z80q$01Sf^}s+=qX`su%Upn^;~g$~fzPPi26EkahlnnMEYv!V=@K%WZ5pjzx-T8Yp; zGk<8abxBI4>=Fmm4Gdjm|Bs_hpxVxnfaD_LF`RNoF(XKsU8xdc#!j5XHdYcZ^X&Fv zmJ@NwXX(PA4gTODo`ecizLXz==^lK35Jxcu~=hTEDz5mW943 zxq~3*Sse$U#8`#J!tq7 zllI4)GCI=MaguxF5G{AJ_~B|E`;Ag9ULOj=BvAv-h-IOL=G zB{hDp${=%jiK2Z`$WEvWG5=2V&4u^&ZqDC zcQH_G3q9XpTzt$a#-Tg35JsmW6 zwh9J26`wK6IU%!MnBs|Yd0SpIJ#soWZm5%;NgG=|tJUD+RM1n&&Y_|IGba$@TocDN z`%Tn26KO125Hk_(5!b}rBpmwB`(51X2eYgt?huh}zCu;crACG$3E8tTwE3@`Wow;f zwX;p~c$m|W&>NOwiG-RE;4$r(UFl1}o4cmlrt_Y+_aV%i;bOAVPg>>}^Fh(ZNndfL z2gEHCs^KnCY^M)g<^>;jYmI>J#N)qObd=ZAMSntVUC%`jcW_{qFR6-OoYMf8xj`RJ zbkV}G4qVT;8PqtxY|R#`QM5$~t;+PrxuVGUR)~XngC8n9s_6C?CVD$&QH$e;cQ(Jpe?3m$)COv+!6r10vEFN7AJX=Yec(*Nt`4~QL&Pg`U&F}Vl4#umlfLD z+oe^SU9zpw4{r)bdU{5nokN}v)e^I=JS(=9^lUtqdgnnhmZ|${#-m)k0bEtg($=oC z6zO|kOdZ-$MhW%I_M0h>Mt2MnuB^7hua6>48s^tD%v*Sd$l<||M3@$mYc*Hup`t@! 
zr|3b<&*vOkAKizn{{f8a0#nh$kyclTk*@>S3hJ_qVUk5b)TEJLw~i6&@r{%HuY%XR ztTD9~4)3+7tABVW8l%SMK4orB352-gVuoKDw6wN1wHp&ihfX+LX(D<6mmT-#JGFwq zE1D^j zR{jmr71v${&ePBR?JXJz`t+}GT;O@VF8IuOeyf>|7cbJ5Fhv>J9`)Kb+X3yaLg0sn z+Q3|{oqIWJv6w6Wm+YH?w#lM`x>Z3_%}mY#eKO+#GY#kP6D z1GI4MhmTd$;%TfzZs(ULWJ!v0GyUU6lIy|$-f6>sF%+X8JkM>zo5Wi*AE|B(A6h7%KB1fI|7^3ed<~dQ~1RHqO8w0uCi8&v7iss_teI)WA_xzaNo}T zv85KMFG^G0N3t?@I}-Ft{y5pIXRrapc{Qb# zuUaRQ?JcH4c4NWNCOemOQz#>-akj2<1PVXVym?(aY?M5T#?|&{*~*DMc))+RUS=I{ zVwS0o$L7~YB3BPbW8=n4!Fk}LR!fqSY|wZ^o0$k>AR{WfZPP9KNy&Ls-c*e~AXCG> zC$M_1N?&|mz_1GMQ|RBW2cc++9H#1At%XlD)!9+02WSUAu&I&3`Fmg-RRGKc#bK;n zj0R0jW8Ag=_#u&EEmfqh!?l#n6_FqKT1Hr5S72y51?LppUl25x2Qp_TY;tX$}G zqz=lXuQlUW42xO#U1>L}i?o7or|16E)XowFbe>>`9o05VAWg=p8My6y-Aj+Hu)Gmv zg`ilCFbNuHgJF613)Ih8?~KOi$sw*%NL_Ti>bCo?&)WbR_F&km~z%x+Omo@z%CO0lOKj_pPFx-#Zc+5Z^y7!YVg3+%v_^5jv@)Oh6cDp(Xr z1m1alk5v|gmMnRceybFt5&a+S^#2BTQRz^z3yEQp(ZMKC3j%V_$1n4@F z`#sDOGc;2?JW`horx!o2#ja(8HpeDp1fSFXU@>raZn|J#dTP}%;j~5VU%+q+jZeGw z&2BkJ{|kpU*-=)XE*oAD$B+G+YW^FjpaRMpkLqT@5ZGkfKBe?aKz;=KpRgToeW&qJ zHc@rp9RgrJ%X+TJhqZmQWhQkWuAaV^9Hl(6hIRbz+H>3IDkd~==Xx8M7uWwSE%<|F z67N)8Q4ldrDB5F^>QwSo>0uuyz>S+kOPJr@3)2TidI)CvPy#OZJYA1F5bVmDHhi=2 zEkP0?f0EElP0tPEVa|3H#+nd1JBJPfcY`!f`A{V_=LV8qWw@sA=!F{~`bjw%Te3c5oNGU^6a#+bG+Wlg#{#R} zyy$z}t!{W+PZL7ljXG9|YZrqlNQa{MaFh_Y>ZTQpXlurYTA@( zuMY!_@0$EO5E-o_7U=C=Xah};V93fN3gdZSBIKO-#Py#&C-HnttW}cO+WfsxT0E6O z*-mls%)Bi{=@$FCS)MP#wY<2*t(&o)e7@&W+nTMQpDWG88m@OeU^CrSf~F6oxU9zL zy3%?&J5n{M;yH9g=zHZkK|E?SgHE75#I;(W zefSZ6REGWJ*+ACgLT@o&S0+U!U4rb<5E*sUSe>0NsJ{xg>)9?bvkh&vr5!Xb-7uS9 z+t#EB2}|6cQouGk!- z=hvb96{>y=c{84C*%a2E)K`^0598nv;I|FBW`vo6zv7zK@l)l`XR+H4`(m>vM3UT5 zS7Uoj3D4j;?O*7H6rL+|Jjk>d#K92rYl<_h=2GJsB?NN-CB8u*T$Yo$+2AH+|2e7i|w_8IB}KA?uI~OQu%G<|AzIzky}f8AzCJ zaojRu(O2lAE6OUAVt1^jb36|^={2@eN?|y!P4fDj-ZgQr(%I6|Q1`WonvP!u^p$n3 zj-R`pq$8hPomengGsJO-0k6U;&H?i|44UQUwalHiMsICs@_0B|ElAf^ffbz=YVgN! z)a=5%x#!QNd#9ecQF1lkS+=F}J0S7+n>O{m65e_SOlhBuTH6J%ypl5t5nmBCb zdu{=qP=XY;P8jb3xZ+OIK0qC~l=Mrr?N)gNSqaZkbAFA5W6Z(ZD1gB?mjHaPl%kMt zYea5Om}WM5Wp&3ja`Y#9C{ZbXYtdGNxG@tgpZCUU)(})6X>kc;!;kYaM=WQX+M-e* zyec9audy=PwJ;nKWh7b{keEz2n%8zX9Yk3ATR?7>03@=apkGmdujor1p35`Z z%Cv-gV(oF{iBgBKGbwYxW65Ov^vDs>p$P;I(wjDii< zEdFwNsb9bIjV{}&Fm^eSkU`8{p&rNt585E6bU@W|mOB0v_g=&`IDBPZwk86KM|dbP z((QoUS;8!$c$5rE&&{j_x;|oJuta6k}YCmQDfqKh#`Qo%ggZX13N=Tywzvv2Lx;7V6S}> z$6PWP?04z-AHZiBb@%doPx~i6)lX7lp_5JVV?DQTR?ujS69QgNRJHzm?KRrJuri9VodK7~xS2Bf+FJNRmoI-;4^$)qRUQ-ldEIV7m0FIFvj(b=p|=&7H|I9-ctzm`?ZbZU0}kiSr8FUe9I$6Ew8E{-9HYuvZ}d_%TN@E@BZlH z39S~+d!sxPMXZpYx6qOBXS@@LcgQZ7_$i!uRHL+7Um{-ESve8N4W zVbw_y@nGG*q#=LJ${?t)FjC=RH-?yl#+DnpFUN6WXj0V9{3faJ?xr7+x{uYT2{F9) z8~LQ_YNnPqA>Qgk%H^EWS-(`g??bfeS8lBz*syYudPM5iceHN9; zZ}_n_dXW=;7_9yXmwDfo<9?@|=hfZ4EfZW`s;yqz(_s#UAIo%%xyIOzsHXP>R|AJ? 
zSIeT~lKTk{#htUO3w`}iaS3JlP1`nZZgF&}wR5WppO%spIO69PU*Muw zP}?`+PPM+A-|JPcDXD@iOfB(0A5Ux!Irv=*Ew@fO-EKvE6(USXPKiYPhlD@Pcc10_ z!DhxMh0maRr6wLCHl`>zvY$&C-^q7g+S7OJXzF;_m<4hD5}8KYDj1Lh_Bl(lQ*9>J z&gET%S~#ckwYD||Po4U$0JxQF?yE5+4Q~C|MFGcn{pSZn_ zI#FEqQ6-KO3+a(I*U7)$Ga(9Yz%0N#RA|^JZB#grg4^u~Z`Dl~$vfN$q<+sEMLV9S z*W$R6d8hW(;@h^XS1&=%qKT4fkhatwA4?7- zLQ*vuGrFDx3ds$~+KDq`DxHdL>8!1x2^Vq`8E|*#&khQO!DnmiTdxwRWM-%Np+l+)uz4eN>#sMdTr zG+3$5j~-5-kfv#HTlchiRdvbS^fQd=EZb*Sp|~E)(RLSJ7e}lJkWMmPvwEz;w92U+ ziZ^(yM76_-YLvitz?l@2WF~YeLL{j1nqPOfGKnqstc&dru-cM#5(H8Igy zie`mNUhb^Wz|-%K*{SLAo&DEc(iOD~$oMG-P4e;yTWFGM>+Mi2BXTUL863yN z%mi5iSQI76Ub2B8C-AERk_dHQD)nAPyl6Sr`|e}exfpXd)>{9E z3H~i_!ylRsJY~p#4?@P{Qf^u0H;m zeU#Yo@?GoHkk2Z9!SmK|&S!~be#-;Ca?QO@z!JVEfy1J(C zuH5Igh+YP3cwg~}2C1+jF|LG8)M9-C*!fe7B`KcH)_u`E{)o8q^?GiEqx8oP+%#fM zqsKZ+w+CrM1rh2eS{wT`k0Kt8nC4C@_+xzNF&4V+P;O=8({=;P+zS*ATu=s|S%O31 ziuYwyH^KgoC3r*5ai>D$$GlLlr64teTpy$Fn^Lu%$G=lQ*p3)H=WWIOvro%5U{=Gn&`tQfhPJZ|K;MXbICepY1Jp z)Od*(hmEX`%cp9=o1ggAHiC4ewjW z>0)uIVVqCXUb?wqa!y{$6T~gE@g4&1t$fzgKF+%_X^-Bf>jX{biHgti&pkvj{(jc8 zZlQSl{H>R=oGZo#j81r1kT-R;FSxvHavuLC&gP;toa%d|=>(J3ka?uCy7i0M3~D&? zw7nhTYVZCP+inU|(o zYhL(+*W}Jz`NkBth=FR;hyATszQ52LGlVT{;h?d8!3}lPh#|THPb!VHxpqHJPu1E9 z1RwUvk(>3qW)os@{>H}q zCiM{)s@4O}x5fLJ{GXchUK$UZqN5Xk&b!veI5hoKdT6|AFF`|@W($Sy4;3jGgQ`q- zi|-QT%R`jz8u>%Z^4H%x>C1XC4harPp#g|(ugg-&#SSkZ-gJUqMQ`$KeyfN@Urm%c zLJi*h0YrOhW;9V{O*kgQ@sW+MujypVTxpc}==aS2T%Bp`#?+l1To>{&G-ztA_}87= z@cN2=Vw4VihUlD~QV7UZ`n_Y^a+FG84jx*e!tgQxsge`RkBO72qq>_t&BRlnK_CFe zv|cwRxg`LH7WlRcydGpF`0&i1xt_`=?s5V}X=2oH!oMM_(2sFd&#laohF9cxy<0_$Ozso=u9gs3wYsIiC z=rZKQ8}8JQG7%Zr>A6pa&KjrmsVcaoKK)y!*)=v&U!wqBZ77!C)QYmSXaGoR?2Mmx@L#KNJoy z0#MSvbLNx6r5m^!L8ybW&^5`+mnwcgFQ2O}75^WU3+?OisMdh!>;@Z&GI(@9cTYQEgJHSGriWW94MO$%;D=bcW>m=^~q zS6mSHj)|m;?Xfvn?i%id{P7K5;Pq3HIYZcNj)K>md(Q7cs-j>y9Kfy;J@ibRgZlYD z!2gZWMs`DAmy8Hy-Zbdsflk*N=;!j#LS@?$v@?#32$Xusdt=U*>HAW%xYbOOPzRS` zQWm?kdT4gmC7w2OC;HO}srN&baUbNAWd~dlvcvI4tLuywFo6tj;N^RjG$;_2Q5k!U zLB-A13@vsg@bX^HUX9qf^(_;-^B+DU5x9P;`VY_#v1Hoklm7^@4sHm^{z`U<|7Q2w zb^ShD{(jP7S0GZ8NL9ZTZeDezOSQ{zo^SAnZ7Ukf8O;QH3}tE=^x5tXh|WJ(9L63c zqsiI)=e=&BeEstM))x}}VIro2t5nQ|58Mi+tH5KQYaO)6ppKuO4Ah`$HIkRXNXqOp zCDL}}dsEKjKr*A6l!%r3x-q0E{cSUz3y!#5C{$;^q&W z816LE^e%E~wkY^~u^)z3m_qn2uiw`R;TXaAjqN;b-`C-GtQN!vJ1$!jnj43I(CJkE zHs^PRsTep!yMj22x4tQ^1-`VzOw)A!+a-*d)-y8vrlS;fa6|1F2uAzxC}QN|!sm6V zhJQ%CACU=QW)@gG3d+FC=a}HXIUgxPi)Aj@CxiUvpv;z0fV)Gv0NgMC2UyhfY7V=2 zY#Em;;tAQ~3Dr;Y&3NlyS<3UN?BZe^4;!(2m*VREU%)txpT z;{9Jao|>IT?Jmwk%*N@!tD_&TC$rG_+iCkw%m|dV5mobVLe+ZisCC$QoVAg5@07GY z-P1JM8Fz6YL&N#G6{*wf-%Mj=KpIYWa_f3@(%^{}UqLlR`9Xx%P9zE<;QsyE>*AMP z`u1~9=!o{DcV)J5=c&!Yd#XewNNQ`wX0C%7!gH2YwLVt~ zA179Joi3#Rn;=)~76nJMR}1z8`h~^&Obz+vrM3MgLBfN7zzydWA}r&sCA9bbOn4Em&3eImDV1+(X4%phQmmmN^j_c z`!wOuyQ$-(t>qFKs&5pqJ)yPPjqTE)2th`<~xDC`O!;O z`|Egc_Q0n@tw2uEonw<|(q!nHLR+$k^5HRw7Tyo~1J7QRmFM!N8cv%rCvTNM*g|-p z0x+$tCe$fO_X~fd=o=MFp^Q3V zu4S!12``6sJd57s(y13YIf(NMyp4EPFsIkm0cz2Z(a+V*NZD`k zxXuR^JwuefK3-4*is-9bZ-h$Hw}2lrtkT{I4L|5#70A_4`eEKG83{>6Nkd-El7EN4 zgzGWMTc*V_!NE0(9hi4nz!l=`${=RZjFKy>8U9Z&)f8tl`UN#b3z9Hdy-rRRwU=wu z{<8UHPLmI3{f+ND)9jv{z7eN(>azjL?5)Hw$khdZ@5xSJ$E;3ax5i4=wEgqF>u{s5 z;f`^84>5_r!|u0_V123s`eFXXNArLwKN1iBj}Bv!LPba$!4U}kCoCb91VuGPWgtN# zT9Q<+RQmr%^8ZG!Ww~>S?taIe|A|=F2uS5V)*J%*d#wzvDs@0a{?aWczQu+%CxFV) z@$>HVekUd!%9AHaG-{j3EbI8(0VUK=co;3oU%ae2c$g@I?fdH>}1N{ei#JE0Mze>6fd4S?DkOjt2E$`>WAP2XhK*u#Y|bsh6yWgU66kQap) zg&lU5maoI7ngy-)PF2^2(U+{kLWJ2=#KtUkXshRn44hN)MbLyD?%iIppksX53NY(G zz%ed{0{Q^tiDaohxh;vf4-y0g^%YsZs{@gi*C6AcYeV~kE`hYainyH>^O+%)(orC} 
zeKr7MoC~f51+wSk3%UnLl+Z5BjG>BoR}v|Ae;WzE+9~^}F$o2n=+|+4$hPN@^mUFY zq&{BwnxtKi3u)#7cn*5L6Tg!6B8D<`BXLmf#AA&_FE=Y{gV~Dd*pgk6s zNB(6mAxPWe#oH{Y+wDc%c7rf(yBqiZxP)Svrt0&haT0&m0>d1B46o~E@DcJe5Wzw2{b)1hL>;l%s(YTIr5y%2ux*OCLwI&W3ojy+fPt$=9y& z4-?8^P6S>b3+nPuyA;DGH@KkriCi`5kZ23w-9a>!Q#Rvnvsc#eR#yrtPp@eE%3V1W z-$b{vTG$nx;>6^X%|qPYJ)mY*UDcFN=OsclBT`Yx-06+(d?$yMbZ*~xkw74 zF#qWUyPtk9kEzj-lD$T3OkZM03eL=Q;_oc4ESL=&E<45d#sk|;h%0Rms``lg-Bx@y zMwz@f&qo1UH?cEad;xBL2_*Iu(3cu&$-J`HvySxH5n4kMeX9P&SXDZIpj}^$Dxak@ zovR(DcpnVjanmmpsUo`JdE_fwf)DIE@HjSL?NK9ZMcV3edp+}th-Ei)?E?gGj4xGb z>5bD?2s3Q3bC!jS)X=8DXiB`bP@YQ@^^4_6jK$_7VA^|~_b(`B$3;nOQED1C|BOKh zXJf4@{Q=1CtFd9@)h=3DX(f79YtTMYHQGqfmzGj=oVhC#EiSc~=s^qoEuFMq32fpl zdAja7W^0W!qwIS;m7472qk%z2!Yjt{0eGPpkd4-MDam&tyzD2jE=4^P67x|H9|*O~ z!J@vkkkwn5#TKWd1kogCvWUWl;K2pUER5-F3WHWxM-^zL@Bkrh#Hj^rwOsNIq$>l@ zd!?96MJAub_;1OFqr(vs*v4|DIMPtW^XEo1O zd!}}fPhV}ru+yPDn}V=RXe!$4ylwGDhiZLo%C+(aRJeo0J5Q1)m>5-H{<33Aw_c-d zy75PW6HZBMh~+GITNj4-f72@F_0vg(o+hq-U-ag}ST2ah^E?%>V1yN}x0sbne4C(+ z_AQGf9F(WA;!u0r?w56?g3;?r^c*J850-HIaTjSEtWr%6$7rI%?zI#TF*!;DA|inR zgY6-cHa9@{*yu;KDW25)m!21|^!8s@huh z-$?8O-;FubU>^v`sMdv#Q%0yvmQ&S0VvI5nc(sJ=jqR zHI$bLSD4TUE|={Kc6fTMREV;k>NKeWS!S={NC^O={Xl_9OXg&JhEUZ~{g9Nwc}0Oa z3um-bQI1DdJyc@CN5X1*72orB@LDEr_7u61yAo{O&=|j;yaM$EYAlAz+hPhx>;O$g z4$L`a890ODG>^cm8ZXtz>+k8;sB7gh4Ltw7f|<;#f%3Oo3-GV#RVVu|R)uXhn@bnD zggBTj&P_z>*C8b|mSxSRrI_3DzA$G$?6R)tY-u;f&C4je$3s>1g^9Ji)z|v;8e<$c zE7R5xjf~Tnz*zAgaIVYd=6y`tBmJA|)OC{lv+wzdJjOWvMO@v@XFKB+_tOUzU+~)?bo`oU}=0z7Yuo zwT)h-Tl=k$9=KoXB)BDr&1zBb^dj4pI@C+3cC7L%ZXenG2Z++)hGqal^4(KVg-uMv zEnZahQwiUic+})WDMX@ANT^2cM@p&}?mVh?>0-q$bRts`f02msCuI^ikIdym>Fb4Q zcG!xnpb@7L^|0sHg|;T_d2@&{FU`MJGcPT=hy|aq<6p%?eH$a;E(c!%97is!ImS&aj1!r$do@>{?V@yb)GWI+fl;{G@ygYD zMWqZ$M`y|@Q3(cpO!|p2ir0FsGPG7y=TWBR6mxWoKw4m!?~0MQo|QiK8hNq<;|?%x zenBOb={OfYDcezJWw)5j&3M{(6>lEiDvi zYr6q15(un&4RBw45{e{Ug60xu(%)~tp zMK4OYg2j$D4j9gg0j1-aTaV`U$+e?)b)UN#?(brUnm|emlP5n~on*MMuILKvUIa_` zvm`1hKqP4yhvbmJ6_ATtpb<9I9nR=O17YOvq(MH^AQ+FR1=#)c7=E@c-wN?8n0QF2 zBZVl?R-=Xt$WMHQ*(tG8GYSR zxug=`V~#kVA$HVFyqyb{wQ0kb3bLBGAMsHwk+k=8L|7Y1aF~($$AZ-yID=s&@lV#f zz+w^=N4x$mq2tG{6Qh8@sk%KckA1PEBY9j;y2~`7LLUw-lE83#E51OeVSCJA(IF|#T4}ivU2#r`x8zQwZalbw2X4qOwOV|( z5q|MugyD0LriuVBbv4V+!^Ma32$=h0#fb5!PA{1CQ@!F`i&Jy7(j!kbmy2R;WhEN#V_>I{4s& zO=~A3+zk&ve2iC3*2sAWwcHz1IyLp3j|E4+pU!A!1lGd932+VLa{Hz;M_g2aog^x$ zD8nW-!zM#a1<$Ny*zqN|WYUwqYkh5~vdmM8ukvM*K!`tBchd2tc}&hf8ca}`U>x%q`u+E*6rqhy zrfg;Md!TD|UX(_{d?lHls0(8>y79(3YuqI>z_GV%Ny(ESL7+pg!5JZHC{U*7n)XuC z(ekY#jC}Faang%@BB+~ig6!p`2+=2olqP4#y{NOC&hQy~t6_b-AT+`V!?$cIgL$(Q z-UOqORI6Tx>iJxm#Z!B@$!b8Dtw#Yt`e%ppkQmbytw>~R0@1jGtbW*`)KTe%W(5D~ zPb*AD1|FpqbWPYUj15qP2yXK)oPib?Mp@D{9h2`l+ABdZgI|*f<0`x;t=Xc z&vh)RAzGBL6rRx$Qv|$7?xSmASxP&$U&`QPZo)MJ42S)A=ij#NjnyU&aSTl53t1HA z3g-4VZJVSOoP_WVtkJ6vnd>MWpV@B(3BnG8Y7Fn5_EJ*n)qmWVQ*#F z;WeFKK#IIKm$JXKcU~aFC|w6;C;lg^?u^<1APVOs?63FSkr79bkXvQeQMo8DvzUx| zb#bLy3yWB862FnxcfbV#6oJF577|Y{msy5zu3QN4ncPp7|0~0m`h@o0CFeZu{Ho3h zgN6f@Z_&s@ztL4#T6n~L;dmk7Rw^nHY5oz|(m2BxpyA}_}%Hl+d;f~f5UYF;) zF$=V_VNcb)#WQ$C3+xJ-_P9S_yR>Y43!N9R^c8$q$^UW8)c*(Ig-XR@Y^>k|SFsGC zyfh$GtY?uz2k^RZb(p7vuN>IK3ZAzDO|8{?G~#M{vD6zH$w1giEtDytL!pd*mFdR@}{ zyXR_5fr%$?HQSW=Q?X&`E%kz&X%ua$b;A5AdavR|Q*GGqk#M5-Lx(6Y-6UCB(=8_3 zu;+PDHlFB;OEBs~IksyttVhC7|X zeqsqh!S-9zw3>~j?iA5mn)JtM!Y_`PsR`4&N-LIidW=rRqv-b>^5aWkFs%1DPT1~8 zLaSZbQYjMOrjNw!f#kY3%!u+@C&h9YmaI=qiG^cpv{b6TXYss3U5YWC30uXTap^nZ zrUZgJgfY|II4{4TfAj1)BDKSb0>9{Z;6B*I81qRF&`cfAH5VK~hks}|ylUn6gsumv z;J8R)i@B*n$+z&De~OAnbv&StRJldj@%nV2F~9DERDPJkXQy;BMGF}XydM8Q0Q*1$ 
zzr8Gq10O8qI}$OP0C5G5-lX=cFEzr!8>12rQ`ef$%QS#)APnvx^HZ#nBH-sew;cY4 zw2ZlSWXd-rY2-ZlgL8kgjD0DhUBT6`Kp6n>`BYq-p6B$Z?ZN~s3GIfdb8m6-q(~Ak zBru=m&I0YmDgYa9+?G@8{U~VJNIZPp;M1Jv8N*<9;*(*pu)`#;u@@w9^ruM_(c8R< z6B$=-)h7yb+?uzhT}5v+qiHW7 zv93HV7Ng<_FHEY4Ki0x}I}V52-n(#E^-Z|TZ`ZMfd_Oj;b34KQ8({#~FRes#70gI@ z+C6dDn(PyBxeSYi_c;`srb$^hDM9k=-P_uTU>iKcxgusD{d-p}T9U6raBo?=B0(j}gxY917l4 zqH2}hTa8TjvRBG*S{2z=E;z(WkBzrYIr&A zS5-v@H7B5}v7O304hg8i-1{4fyDWI%boQ#-6VbYP92t!Qd8LYE_c+>yZVRmPof%&@lH^2!Y+1FdHJTqG6Cr6s+%Y0sMd~*6k_jfH z(kC&-`1yWrl~!x7wp`$SMk%W>OO;S@o}#Cy&Xzlo>&<9Ga0>OVR!fFe`A$VxvcLV{ zLz9}M#L>HY;iF(qGAo&1JTVU0P^BY-z33f z$2sdv^KLCA@?=89^O4e{n^U)l@ai1)=~kkgTt&Zj2^0*5TbUV17*aaa_KLE6+neb@ zby!%i4ByVHL#x^r^0z2u?Vgmt)s1lH(wfMQ#aa0D6wz`7d$GXgxqlbygHEs$+dukF ze|CfTkG(QZmg3EPu+cV7dzxb2#1w^LmEelwbl(-;U&K;qS{M7jydCPYeWu-|l6$aU zcIMc4rE%Ek^G z;zGUmQW{2qzab0^jtd;d%sWyEu0>gbe7H=OIW&In=t6`s*M+rO5fc>B5Cx}4L?08ckA7-2>} z=oA5<(!7l!%D8XtwhnzNC!T*LB4AcRalbhqja33G*NmeTcKM2`;C_C!V%AHUL5Q3n zI6h+jw8UAB9)H~|%%`~NO0rD2-r%U?XsFDdT!EuU~9gVXFTC$U^7-yFdPfYwY-c)+f|g#u|_{sTy{D?2NErA zkYJBF4oCZ~U0t9H8iUj2=N^H2*CF7>!rrf9(f_1j!kKZ?!c@C)I~ zF?D&UZI#kRyJb{X#^Hc_S9cYo#ci=&`TM!{_pUp_A|=$dH(~^at=LNI|^yILl_D{%{mDKwgB&1nX<&hWf=3GF;7t5N!!;Q zDhr7gHd5FibQLY6yT=2ANDB26bGZIMTKfWlj?_xZ@n52)& z%OtaioGI8yBo_3~YUppYFFM{aWT3bwhHv5Ys#ZC;%IRS=cLFA^6Cxc$>o&lHUEEWu(c z^8kANDkaH?AUWzn^vAiZ>RvN^x)mK(q+x$`xtM2mF-8W_v^GJ(^rnKme_r({jfio| zmB$44u5HaWBXC0v-k3a6{DlyV?jsG6{HgM=EK0aJ>$GqWprqW$tgN^{cc(OMpzbvr zYpEMf4*=4mFd??78?%E=Biour`A61~M5x0o>#!5hQwefTdCyX6ctl=XZVFlcP=CAl zR6AFUejxhOdlH32W@cfuka3JtfPBR9#zk2`?4n;RZB^%G5lOv8 zCP-C^5bL`gK;u7+T80reGZbqR50rcQ)pV6#8I4b1X>KkO1kC#*iY`cQGwoi5H5abu zJ8Z1w;cJCS2+t$w?O5-x`LlhB&4tX2*mA#eJa=XtDx|M-;!Bn&_ZE6uf;TP%l9B7^ zwYR0|*3cqbTcSjy^hL&M926fbcWF&qk%Ttr4>jGm(^ zI-gTc)Jiq$;Z;E3PSU6IL{ zM&}-c)_$(S2;=)Z##x&Tr?25jlvH~*)bCBx?i+qp8SPU?D_Gm-D)v;o1=U_dCgsYUE-Xqv9o98+QYp=EsPq4%F5Q_Jo0h;RU!hA6yJn`6 zMTT9&BNdG$>=sTDQR5)v8*1&ct^J$MX2AD3qfwm>nW?SZfn&}WjMk;)jlJ9?FafdD zS2N+%TWH-wF~esDx4zbnFdnTk8O`b%b(-#XlXFJgvh@{WOXw`&NCTbn4hR{=MX87l z(?c6{uqz7qkziYIPNaRuSka5tO#k(+D-+!bj^Rbm& zf2^w7ey#RKUp8V@xyE`{hM}ox_wgVup~HNxIUvz+v3L7k1hsjm%t!`sMk}AbHafbS zl}P->`>cP6dsIFwhFNroV%_Gf&H?#(_2!_nlkB!?vcVjA8$)9x&^r;VQ+Y7#%E`_O z^k1zpp5co~$?`GUrs8q*t5;Df$rJ7-MO=aoGfi=AD@VE)Z!C|SZ#kv} zi)}nmXM_NpY#l`^+!lyOEw#|8QS0QFTU;hG?@z7-Vk6z*@W z0xVmyiMJ{QiE^%koO*pJlTCzG5-HuBC_ga&02+=z-#La!Q(|PV@Q{7!;wV}KV3<(f zh%^AjoxJ389t#XP#xqycfXIPHc^Tc(sa#H~(fP7$3UFjA%X?JQ?T-cp3U(JRNX|QE zfdgK7cL82i1HJ}DG%;+jjY#Ytqi49Pf*17bYnzHY z$!+6ylr%sLd4;e~I8om}N`qN$Rn;v{oqIi_FjtMjlvBI$&-AaNz83<&4m6;C_wRH6 z0DoG(b(xHpmj&VTu^!$&U5AVx{{UZF`XAw0{{V|`rhmOR{)}p*OwPd3NKF*T9x2qo zX+fX~1xU3JH6qXjn19x%`@+6FlFn(bbqk2|w%ysA1cS@%Uy;4v+SSV0rS{Lvy)*i=X9E%Z|jJ7H}?Z(cH7UBZeacfJo2lT~tIWuI{6gfydph zQ^Rg0zrD}{p^TZeV`*_aN{Dq z1Wz2XEMRRJ=niX)@ZE&1s@~m4DB5LZWp0W%>0PR>&ATE%&Bo)cv@b z17@sR#_HR%QCsbdW=;v?KBlbNu6+(pYhy6Ew|S)uxZ0fH`qGPpu(gtTe(H>pIjc%Y zCq3(!)h$}$^J~qsB;YaN3d*$Q2;M5}(wtPCq*T4sWWBn5v6cjqj->Lu5Aou*be#=t zz@Bntn1RVW0oZ@~>TNGZwT98YR>95kMPwu39)}etMgm)U2(^9fU}N@HnMt z)65bC`MAlcS#ti^rZa3{0LZS+RC6h6b62`#(A*=-A(jS{%ys93>02grg4fL0?xcZ$ zIjeUcJ*=PtPT#sX&2oB-^2>38jHwUbUU1m%D}t(y897C6bwRq;PoW;Ic9%2608pvb zvCa)dw#{(_dm@rNah4n(dVzwoNFC1nVR+*;xu;m5D6)Bq%Z3Mk>t0nVPZ3eKvpXdv zPAM6;mXTXU9s@Iz^N;S2^O~;-L~Zi#F^p#(*silxv1zOn+r(s>I0HE2xvpB=qRknR zIU9=>`@fY&ClBnZt86+|IpK0^#D%4jYhZkq8TvPTRZJE+Io;}cs}4NNG?^8Z6}Fx2 zkPdn+GY=UX(}KJM!LCZH-q6-A$bb%W)_@1rrgjWCPQ7X)kfXP~V$D{hjFBz8Y#8~c zleF*!N4&a%qXl@*I@FOyrNEL+p}0AxPBt;wy{09zJW4>?p^x4?{${zWzZysLTWK)MdpSG-!UyT~{Jkq`IC<1|iqrb) zXF8FN?CGzydkq%Cd9J1rl;h_9GoFiBc5>bLkc*8*A1cLpg*P424^#d%PQ$}j7k4SC 
z_<~_`a=&uiACf<8S6AgQRUuSvJ3?1X>PCv?rL4R5^#1@dBMyCMu=wE)n_|FKzz7FL z_7x+dTgX*i(4oO?w4PI{l7y-k45W|iQOhg&vc|G`?Ee5Oao6cwsy0_^Q%3IWHsPX3 zL5;(?Zcb{GGq!f2X7m*~ws~Y!MIoEFr`EbnGf7CH`IjtuwRKR!O02Hck)2A>gh%Pt zo?J^XO^n=?9R+N|k%L5WENizMQH`DJ-=|NNSsq;F%*zZna#aKcJ&wA)a zOq~vU;+{PPMxY;RapS#N08R}q-~~31mVhCd3mopnDwQj=^G;$U;EpQ3)JZ;4c%TsS ze6zRsO=sW7;JeEW$aXA| z?XpiW;4scX^fb|{xwmC5MrA^pWR>ADqaabVfr0?< z@A%L%RJ}=HNaaB!oATS_``-0+Zqe+X8$%pfAcVpBy?+{Jm8RQXFq&(i%OJ?W#?$TT zOLwHpZ8#RG_Ncbyo!jmj0M*kXk^o%iYmJ6pxc>n4RL>+9vf?Rcm>iJ!{WDpNC$!Te zd%eh#=j4sPU_O;j*HU#49ihY^62e^c>;*9$E@8L`ZWGV~4oIrHrG=C#ZbB-i-IyE} zKVMqOg8FN#cUelt%6P{F_XjmJ7Z(>#wUMTd96JOYr})qX3u{PjNN|!PzDCy^53Mb@ zj@m|$HuO>zh6y?Lr%P{naL%g>q**6)t=Ro4MY)g@1W@208-l6JQW(pfZSF3-$WRMt zzFn>e+-f+kB9bWgq>_BcY-6wKNpmchST78!3k>6e!``e~vm1@jqNHO4ZsVuDFd>dP zp`D+}%41`qVTL}nIvX_eO3xx>Z3*0Z(lp*;PU`zt&He$JNEd8KzE1MYcaeY_>L?Lv zKlK4*nOZgAJM}c%lO60Vv7&E}jy*H!L=hMHB7}Fd&K$S(t{EkKO@!2CoPe zBIYsjkCX#YiEgd_aTxiVw_H=>xVM-_Oncv-yiRza2&MB`0g@8SpW+#&Mi%{^B|{!K zBo5TCB$LR`vOqt8q3QL_GAM#Ebh+QTeLp$`Gs}=jvME*hwz3|C4&toI1k9dcg=IiK zRp>|KOg47KRCy%{&m@q3w2?^g+$>F>ExWT|Vwewsj8N?@fDbL4=h~cP^Q2W070(+$ z{DXEn4Kp#K+LQS4N)!0e{Jj%OC#ty~) zezne~r!;U$ZyUSB>|72vhVC=kw|*sq{{RX0t(M2{6OcdYn&P=Nk;VaGk2Kd6y#!Ay8i&aYv-{lLT29^kuqU3f$z!d`B%_?3Y&L; zEl31@{%)r~#Z-uq*_tRP9Zo3$2i}?_c%=rKDZ+prDn+P;JRDF3s88<`{o!97++Dg} zT)@oUSK1x+t~vRRdHPq_Fg9dQ_p9UymLqj|WEuVI^f*6zttMP{G<-8^(A`Nmky#rl z!*U2JPadB2*jq06!26_~p^t3lxX%wrt#fq4Ac)aM{{Z*>Yq8TVB1?##V*XTeD=<#D z=D4bV6zGS*Z1`Hjc)V4pEy~Fl*siL2Hh-V=uDbK@#EKQU!({S4m7Kz7yN>1@glPUp zQOEuDQZSzBlYC|{Q1o~EGyh0`(qmG?xQYIi^ zR*21$Nh>3qt{2v-wY(EdC4Zf8R|lsxJ6sLStTF{e(x~B73Qb&TmSY=KDs&Y(HV;av z6xbZ|Q!5fODv+Bm=cPp;!ylzJWoJ>i55lC~kCdfJOK%uzv6Fdt_K3jkQrIXu@CP;8X|^`jWg@e;@>HLlqg<9A=hnA>u)!1)U71stc|BuFiy`>Ch+adHf?Y&OGz5;3AchhD;r9R>2(;c)&THitMW3w z{{Tw#INKxXP_>)_Ms$cNZn)yTXy7SEN>S)>JXG3gn60KoHQBjLhdkg`q)O4K7^$I* z16Gv=E3Pmwaw}&8J!uF@2O_MzW7CR&pwCQJmLlYSPS9{OO;&KG;9&QvceB9~iCL9EJ#)nXV@{Gos-Lso0>ujvCt7mf+Rtom8tNY@c$A}O2C?sM z(i_HUExgCwfF@jE8q2!StP|yKU#_P7PX$vFDB)x3{sSndY=zzij#CPn4tXJCCWRG>IkKrkvhfY&R(*A%6(sG zV@`J{Pzi6Rr7Migkhy5t}C%?TV-R&;$@d&BMZlkv?UF@ZVh2Zz`p($gMEaKIr%T&Asze zNEX)Yy_}n*Uc?Yd_VuR5(_P*owWBII8IY;!I{Q!pO02R)vM~sUZzy+e_p0+;$kAkp z{L>)~x;_t~&1^lSR!OpZHeZ|N2`R&RRUxO{O=%+{h*jJDTLQs|=vV7Z23x~8kyS3A z&6TsUlmHvnnRTX#6wfP18>lvzF-UFmR<&F z#yce`EL+Qr1aJ1U)Y3pKR+8E`nRLJh`3c>)s??1bwokI#m1T3gB|nSnR;IW|Ngio@ z*w^MK7$DU*lgN=_OPLvV?@)>vhI{=gghd&6%g-xDkuG-(V2@1t*2TT^C6||JB0`VL zNO?K*stESdv4Q-;%(-S|1h74+s|+m@LcVB^t07E~KMs@#y%ns}D!ZWpxh=aVBk59o zkxkN0*%A-D$^2DD85-8q2-0Ba$7+>5D%#z#VdnnqusrV04FF3tNpj5@cTBJ=lisOG zV&?{8S&2Cj=cPWB~M@p2%6pS6Q&sN>n+JGqANXr>dB1S`kTR*K?j3=48OqNvT zam79~w~7by)VjYLn3JDcNZJty+8J9LdFpA1Mv1noA;e47LiMJ@IwYSeBx-yA0D6g{ zl*Q&J%%k`-$@Zv(61s$vK^k^kbM&AIWNVdV4Yo*|?%+`-nv=*1qJT##dsNYW>-Ta# z0PE;QOB{hEAy}OK+F(X3n%|O$Fgh=#MzJhz#h;Sfu(dVJhC)ngOLXg1iGYg#`daCs~^t5HapeNvs%tXhFLR?fZ!Ttz=48=aqrDAI^jM$n+ zA^?Hc=FdF;0Q$Svto$;TB=ABaeaLR7j2vTd4Z-)WAH~<#7cdKbGf;>~gU;H#$lUSI zMf%rO@X3JiCA#hoMDpc_enqdzIA$;CxH-dblhM3+s}gZs5D zt>2#PtOAkJvtpNe4z$pAIp-9v7CHe+K{cBN>m)22EZ@RSO>L8#GK=%Kp$H53yZIjQYQP|NbOWu?G%|w*z=pe zh1PaHZ2C0QBd`F&GjCi2o)1x6z>Z5PrnZay1np z7$8^Ro#U|f>s<9Ae|Xvx(#8*luWq8(X0=(|TS^d^!fqMbe=6>T^9NKZ@?yN-!&5YV zB66I!*~c9T_2#__JH%M>`RDzg(v>%KIju{|*^o8}sl4)wk(`rRu+QemxI0IutzO0$ z9o!E})X1LHi5r}g-!*1Sa6u!jRE@$v1Dd(#403wY0} zNY3HKN*jvJw3*B8+rjTzKxB31m=-NzXKI63GP02wkgG>MQxuU0;?R9_o=q2VYx@*D%?+;bf(Q3 z$(@G=g-$VE=sCBzBJ=W`Q{;_gO_C@z1ZCqMwDUgiZ)&>=`jjV+$&t3FEOd_->s?r55^NH+|Fe?^9W6k8f|WrdMKj6Sow=&cBXXCzYZlUuHn+KPu#6)b%I~&#A-q zyHIf)(T^_;=vUUd?M6$xEkYfzhZ3A_9R*J&oo}eX=`<=gec(93`c$!;L)poxmP=UX 
zO`|{F!u?HDNMwsyi7powsM%)cc0Id(b>7?fdc^{~P{^{U8EdKy(w74n}LBgJ=zACh~(70))lq3arDe1YX@G4p&r0~YlCR8|!X~j{D z+`}rHlxB`Yf#nU`IqUC0$u1xutBF-{^C>6G&#&iGiw%zMWtt?FHxRfX4_xjQJ>|sn zB#})Qn+rw)IXsSgk5Nk2j}sy!CgE^4=j98J_NW3l+F1Ozj^H{GwlTXUv+AT%E56}-h2Q;8RqKIHbG2w5YEGc*y03$>kk zW7vAugf|xd0A&Sa`I&g<8TJCHtU0)2aJxucstlGrv*}KI2Uvnexk5K{)MvP+A@e#G z5=k0?x8?+9szv0*Cdares@YSsb_q2)MvyYYHXTCXJeemSTDKf1e~Cc#_Mrhvcv)t4 zk7TUe6(gYasZnH4lq6th2M^oYr1FGrA8Kqpc4sHjr->skxmB2v)rX}q3=w%zZ&o3( z(DbMb9%P+kc-&_SInUOvu=2(>08brgDw*1-7aZ|S3TbH=k;AdtdIQcWqShGK2@;gg z-Zgcih{nuR?(QltIYKd2AP%^q!E&w^9pEf#xaXd6Q9LkVWeJhjBee$KH!SVUWaNzV zSEE=_0xkeOyHEyh;U5Yg<~hh;=TrIOKo`pgo~EUmIN8&M+2|@r<3$5$C+k2AF`y)p zB=3XJRY?c$zjhcX8OY=VR^BL|rbao%Gja(4=zRqODi|4L*kr@EJJeNqB9=UcCRIEy z9M+V_3OE$UC8HoNayTY{GtZkEe(iuA!wiplu>`O-f1OA&0AD97?N z!0g358jKonHK7EmYGpiB3TjoL4;S&r+6Tm2_J?aKc`nMsAx6dL+v`~JqBOz?K{rhn zC5*^aJ{zg`{&mp!(^OqA$NG<&t0GMlC{c$BrAFhP-2vy)xXHDoSx^YP!yA&5pu=tc z9D4r%I);s2dj<>9e~QQsNh*R^~$VK0I$HlaYS%Yt%q#!vqM zTD%_mSlNki%+8~07bGWh_55qwei+UVgAgGn{U%;dQ_%BGY)p>T#@eScpEG~xdQ{$H zZtGFI;E~pbA!D1KaX@UJT8JplM_P8^rj@|5d0&@`rpo?s+{`xeG2X4l!yc3ngyMh; zA26%5pWeXDStkcIRjxLCqoAhw<-Ku078#CfD_xBwvjN=Ow**ziVT!F3l__E<6DQmaH)iJtj8rPz z)T-E~1(YF0Dfx#LTg~Uvq(-CzopPS_P1%v~YSXqCH4MRIQO7ifLd~9OVVCB?IH=k; zE)_b{C5GW-P>QF&T16U2`4|+h7;-&DQ;5incQ#F0M#!cuhk=fi^Hit2h)K5~lh9N*(K@(PcM5+h%?##UL%NU^vGWBHd#=;}u3*^q1s}fXDmQ5RifqkIYY6FJLtR zG5}6}s&rr!R4}&W1GuP+rKBo4sj5Vh>Xsu=v=OVcY^)n9aakIjl(zOyacLx`;TPt? z&fq<(Y)S&Q?2a=~iQ{Pe(0)%^V40t#>RYtvrHXnu0}qjN3sacASjU^(!>F zvoXZO)DP=f+I$yK$WmEC;kXUSB7iBbk!1{CQQqw@LgS_?7{D2}j@8Uc*%%xX_}4?J zM1@4q;7Gkk_lKop&-RwL-XbY5<#^(ihjLZ^%DlBP%3Tiauan-YNVf3F6i(!Y=;`u< z_P3XZiPl1K95 zjIWv#E*$;Q+NX_OAs%ncjgY~9QYv!RvMNTXeMSWVa-1H2*>LN&ayom}WCAQ?OjmP4T?Mg#F^|^6`U;QEKmShj4Tp zhI8NPT8(WSfIEPVocy(J4L%VYI$bv7{4II*mb$IQQtLv>&-EZRsInTH3A zZmXslpjlA@l|L|UfO}H|PSAvO8?nw0T0yBzt)W|R)5bwn`On^COK+w{GzH>m7029i z#~$2M;u4mS%rx?SpQJiy3MFNhjN)kt4Y3K=VFfr5$w6O8ZaZveU zJbTyN*4(^U}QsRn(%7;xzD1gEPen#8;~PoRj>kZS5go?0Mq= z4r&>#`W^v)Y;A59H(+D{?&qP+dhf$Kk1N9jG>ms#yL0qjmFGS#xV3$0;Mm zqwk(-WN=%Km6{|h-#1#GIY?kSPz9Tk8;-nosw){|^5lE^)z?kx*vQW{0z&BLB;e8* zf=qBhBBe%MzwG}2DwIZz+nQe|@SqCvJ8cR*X~Nrf8#t*N-y;KqLKjk391ipV=+!{O z5t?~m1}UtYaK@SdY+&>0KobMgnzI=MRYV&;Q=hF(8~r}C0c}-VwLf6T2Nepi1k|k9 z0+@yfuntKlttXgOf|{Ek_r*f-24ML#hD;mf^`~b8rfG-`k)AVFA&9Ru zfs^z!@HnQkY+U1-ayC~q!~pU0Qf@tISOPIjKP>|zlR3%aqw~IObQNKfa`RELgdB{F z`c#Z#GQS5HG_3?>FjN((7Bm{Ey__x@b&sO;?8&S#QmNmQsg z3x>~nyK(-rdYogZrbU)*-D(ORmg#pXc_7*|kx^gi@v%iw<@41}D^eBtH&RFhi?oc@ zsOU}3QSJW0E(>Nde);5zpUZjf;@P;U=Od+S-C88A_W}CTEiI$B%Bu70Q(}`XLWCe4 zJ?R7+T*wb6t_4Vp0KliIKA06qlD2X({PJpO;z=Rez+#?M1{nvMX#_S`I6bL|iLBKa zGUtJe6UAj++eY&<6p>pP%2wNxw>jWc3{{(fOcOKN7E5(sPEO%jw<2($q`TA}6n6U8 zK`ddAsNnk6OjZ)ba#}`W8-fQ)k=!g~-3*X20^=thwU@-QlITCY2XXZkp>ndta6Zck zmB&1Es}b8H!e)^Go&NA6mG+=_Gxix#qa+q1j>eWddzc|aKwBTi*zmXthLhYVd z{{RoQXnDIKEh-!u1W5I=~Wsjt(_xz zne)$F(UJKs5eL6+#r zyOlol#7#0Y?B2lh)lF8HHunzb$0PMA7t3H}eY_m!Bz_bOt`*J+i1CfYa5MdB zq7@45Bk~X2BCZ(XI83=9@Z{9KP#`W;H(U{sK#cPX#~N*R`OnRfocmRev%kyt7y*tK z1XZ6Sa5*DyPjgO>?v*_KDS*U*afacVq~HLmaK7}c5rzrP9zv77InQHG!X!i(O{XMx zrc4V0G0(M3s&;hReX2Z=dj538Mv;WUW5r3ZD-sDg?M}`YJYs|M4wz~HXD2_(qbr8V zKU$O^2R-SogK!zOYFP;Y9<%{mLRH7j-lQWqqpw^o9edSH^jMEHZx!@mBKpSG7Dlv*lCGzkGQ&vf5a8Z-RLoC6#A9kT>S(gOy zOwLXz0p$KX5B~rO*>K`D+ipW1yMpKC{HwIKz7nJ#DIRFgBd^!exnCXJ!v~1(mB*JU zwT04OLI>+vwtCdpR_Sig6rM?2=8WS2fzCRDahjHB)}~L2Z7t!~uEQ32r;zQ9KZlRK z_?qiH8>U~_*hX%xj77R`Y@hzO(z0}44P0FKk_-Ec+*!vV2XxtyaraNq99O1W>}b(p ze4y`&8_*u)*3z;s&A_b~oHFrQ=&bn!Vy&R^@s33zOKNa&PGRX$BMj8v!%PG{20GGi zIqOaogk!Y=2OGsyk{!*RwRr_NEd?3gNzQ6jjDwoCv0;rbmxI%aRso1hN9aW}=WudH 
zMJWWzQFz(`G@&+<6m{!N2-Fep?^YG^zV#CfaZiuw$E_eFW5DFlr4@0CYYzG5nld>& zk7@vtL6re|R57qdIH`AW$2AK_jGTPbhDo_xcc~F}wN*mluzFM0Mjc1vKo>~H4HC?% z6nCZxGP|>Y2YSr&o>_@McRW#Wv9k=jjZPQ}&$jy2&8LcDB`4+JQU>D<(w?Iu{Aeh@ zS^$DY11CIGK1!37Y-Xg8oDjg`tMA&}a48KImyA=`lh7P`)JWjtnv;3O0wm#*lg!)2 zMzV2DELh@!meNh@%{(?2imrfxPf}MrQxPGP6pFy}LEw7SN7NbsvJbrg0)1+zxZC%t zMVJ~uPtuN}t2DTL3Y{M!m>KsL@8*{|Dtii+J&S-it22GmiqFd++(s&FRm~M_sTrt* zn@%y!QcXcO1ucg3sbskUff%CVEJ>Z&08}g-k+%cuQX~0`$f(*tBp?oG86+y`etNPT z8mqhI!KbGqtu!KXMKQ-5QZECVenx57Lql#t2&lr~;c<@ivI2P>YBESUqyx&4$fk~@ z5t?hsih=GvaX`n(8t%v#`coxjC3DBE2IJ*99V(r{jG<;Y{{R|H2^wg|R-$Wy%JJ<3 zwoOr$i6n!Z@yBXFA86cIl4%Tyu3cwn%Oro@Bz5(tdHeRE87;#$-ak6BZPIA=$akLP z)_YimxY}}f!oFzKU~LVoMPMUoeZ&*UsAGYiXP0RXM_t(6`F|?1@i-`f$|>$nLs7U# zFpb=tu0bQOwIDCr!x{U`yS=!olOM8NG6=Ub9zE%*U1rNJ)cz7V%~%Y#7~aHVxa&+r zGAp>C4$ov)l?Epkc5go1y=dVnPSi;92 zQJ#S2pAH@|gxC*1HAG=iKK4Z;sQ~exT4EgZKE8XY1tU(oOh@MnH{N*Gb;=(Q`Uek)NG9 z+&*D}K!5b&=*V}sWt(wnz% z$0z+90`Vh?&~My}whN$FI} zmTa1qcEn(MnysC**opUK3IhRAgVKgP8U}GbG}CRh2`?UK+)RHOFb1bjDbg z67bCGSI!6mFC25%KHin-dcrl=p9&C+s;pf5_We57C3P%_vI7&ZmZja?<8P_suhydG zhL?kEys>*D%CjBd85EzJkMZkWY#=Aiv!0d5cs6BuCD?@p!Yrhmo&1rS?0llF$3sCS zuP|){cd3z?)MBe`&OK_(WbweJB2@&{h2e2kx8QM7#`&g%NT!EE-5(y*%#heL<;wfe z1e}5ILdnlGpl(`Wk8gTIEaxJWaCsF}%2ub2P0LQ=92>EvF^ujWohhs{%?3;^C>W@q z11-WWg7y#`T&M*&QOR_z@44)6=%`p@Cl;u1V-l9{_sHs1PieM`!exj?(0NO|#)ZqHj zxOb&sGdH|v9V%tGW3@;%EQ**UC?=V(HZ#GbJC9Gie@c*_T8ClAPZb{Ge(&Qz7bBA# zb*BkN>Y`Y$Ak&aEgs?PRRxLc<*r?2%@-tAm+6J!#Q6 zI3u+&2_{tW-?b!fgHyjG4DnU=;FZWd`Jf4d5=p0T8O2p;yuX#WsdZ2q~X28+m` z^{Vqb2=btPDwA9!s#U&Hdy0lzu?s4e+z;o9L1NjQfwk}rRG#b0KfK`wLqoG}8!&3S zJkN}%WIgl6A(NS3${ksKM>PZ7#^G`fRCU|b(ri^LxrkBNo((HRlJcxXs(NENq&pET zs+&vi$>4p;0JC@w|}8f5w3Tkd3$;e|@`sX}(}bX&F9W;vn#Tl;Er%X2H9u zHE5~F%%G>e0df_%hHsMuah!^oCwG`@b*QRNRv}}kU zD~gBvHNv;y>&iE*40+|-E*WRNgVT7X<3>V3iVq)Ax5+=5SPG!@BYgzdsM4r(!M zhlf3Q`qPZWVBiB$$&%ra{&d3R>ui91{NQ4uk~5D`J!z4=hb4dmw>2qp%k}G;0ElcI zeLX3`)PM&=>q+JnV7XTPTi~4tFlS_D}(D+cQ+mBn`A$G9>$mrLg0-4 zwDtvu7^up(7<2DYtI3iwC<1&R#-iLy5sql{_M9;0hLjPJKm($c%Lpi=Jk*fgvZ4(3%!X2c&XHpjGC;Zs&(m4g^QETATKKhn>>wOa=#U zqj@v|Wg7!EAYf{;WT~klS6pwP2|(tZfOtJ9$i*OUN_GLk0OFTz0MfR0EGr-MMs%^1%jqbdR7uC!iYsDwwM7^Pqsjf@P{q;Hn1 zg&isQ!0XKd9v_@j_$$UURk_c-Ng2aa80LhZ!k8m9V7DDZ(eFw0yu#K zE$Kpna86WG0<3U55w=3O=i0U5kOTmpYM$I24k}=|$E^!ME*U|mADmWp^SF;noV{~+ zBwhC|+_7zQV%xTD+jb_NBoo^c+qUgwf{AV0n$yqozQ6CguJhM9r~0a{+Er`guBzT$ zd+oLE^<5Lbw1@xfhwf{vrXxXlbQ=y^xxksE--Lt}h;kSg`Za5`I-3iO{y_4Cs0tir z?5G}YG%^ybA9dNzeWno66qjS#e9DNFIe2W-IKYlDX1@H&%OL|UIA-E)eshv|WsW33 zPZiWRN+J=`%wz@%v~EekWz0QlxA9(Oh7r1UMy!wgaouD?bCdn4xt4`{&Qew7DUUFdzS@Gv z60!5PhD5z5m|&38Os`B*ZA6e+z;m9N#BQq_KK!p6>i0w?4pv{LqO_nC5XI#XIYigS zYfyjphX>L?I$>g(zLGv(Tn#lRSz^ldYIXzGW@vb!6X0y!u1IH!YnJ?hIzwG^z12Jr5*t8X{EK>aZ)B|Un1pmO)?85?xUg%R5QFbviv-W z?_M)7-@0woDai>na^~!bO`X^}S)h#NAPK?`u^}@N_asZPNsHg{0H~ zOh8^~OsJ(XO8%!f@dnW=4#3LPU_e^S`(xt>|veAa+`LLjUaKHB$mT zYzd0>E1wI-4O-tSh72U+F|DSUM=Iu)VrNtq0cFQ7wM1XlH5xxEL^OBu9iP7 zRJIOM4y0oNmyfp=?tdx^dMYKVW&_W^6{5&zfI^R=o+e36-cd7}bzT?@R+~a7O=pQOozJ)Zh04N}2dZEHF-+6DhK3h+DD^zf z7K2(x%(#UiI|<5|L|%5>Uc@rjKDtr9pIxNfbX2}NOh5J-yND0@^(gG>%#qC-CRcj1 z-}X2xHO-AR6`f2hXd)?7jDBA|f!4{${fu3@X3)}qgwU%4&LIm^;{YGu z;XVV?r&H?^heqeSKBfALd#{PUG?v4WF=o_Vv<#-^Fd~pp^bWx$UvMP;8@!3LX)0Va5tM zXijL7tH1x%cd?RURoxSU@F8p8Z(@vV|6D^jIRnwGAa`v|H0;qFri` zB{%oz7{iQ*&(eA)Ht&xzp?X&X=fOSD6V3$9pR+FNfUmWghc=$%5#iZ+DTCq_&4*zb zA53|aGo#%iCKgz&M+`UrPI!$fU^eiQ=SM*$_WiAfbUlIHK?uZK;8J` zG%!QYdA=YiCoCa|fJcqt0x#jQh(%nWcP8v89X6-idSXhl8WcI$XRnZ^y1$$g|Dmts zmuV#6$58uhF}1pZs{*k_CZu0?88$MulMOy#lllOp4=AgM^I`qd*?2AAzIrK{8Y)L1 
zd9WS(IN^*6r24J}%=J`CvYNk6!da;kk%`6c1k@_m8H3i$vuvy%y;>fy{9TQarq6jkKkh2i)x89Cf`z7Vq-_NX!z>3v zYy(%$%mleIK{3)pW027AlmP|>_J<2K{9^2Z6TgjXd{56~^iWKZ`>>6WtQd={P!)fG zOoA>Y_AuvFExw9~Ecf^{6CGH{JkXNzO*QB!2q^*M#b2BTdO5Eo(T2r4&%EovS$aD& zRJz#o4%dXjV$hOLHKkEWa_p#t3^abMWQ-fVpQL=2JxA+`Z{A4vUD(hkhc(($YC+KOsMEyLF4G>tE&9T1BmI=UWyjxu;v$y%jX#&WrVwo zZ%Tm1zfz9a^tK4xHG3`^Uya!zq74}G##|C#O<0qD)JGlxvoJmdUoE^?m}nD&sRE6Z zs{cOHYnr>fAP^lDshLH8dIMHf>L`%fuZY&v!&yV+n2+!gV+sY_1R1O;hwG?JWNLs# zK^lJ^r>X{lF=K=#F)2XTKsV_KLPBFyIWsh}$v%{<7W-Uzt3L9QuQ$!j?M4aXPyc2h^)DuZyuM8_zyEF zS(yH|Cu!o*Zb`OY9sr@x)4^xSj2pI)5(3h@(!jL1@zS(^FCJ}@Wy1`+{&VA3(nDE( z_?Xe%)FE=p+8G27QCI10`8AA{FyAjFa8GzRm^?N5*in4Xvg#`kq+=`^MIuhH(@3PT zG0o`53Pv*eZ73qvuWTwK1Mf9VBSRmz9n;J(A&ij`Ziik5XLYx6T&=~i;Rv~3ZwCui zHRj$=tRFo_*uww=x&<*))%UU6nXR0M#3p-M67n0EC$P-00WpQ!KZZy29Eic;4Z&rF)Q?$@P_xU z)Ov_qq@`_`h_Id;vCl#c3<@%ms8HD0#0+!%lyk6;YO$N=-F6LY6Vg)p4s;NKC!&qk zB4b7%*9;XXRkSV4&lCf$&9P4!zn>;#0q1H5!qW^pA1U>kV$ONP%Ng7HlNFz3Rd3 z^~^9xdI}gyw!Q>)&$Q9$XeXedibRV@spW+4&d#UxOrCBk4tT{uNbIBhX6g%I2ua3B z*GnWJQ3{Gi_{*L2qtx2H|5WutBl=LdqCQ%37$ALkTzGpLsZ6+Vw}W;ncVVrSq8t(l zE?1&JBQ(l&s#J-#m7AzjCN@b-_`pH*%c7u^IuFw5z8^g#K%{RG=*X%b(a*+=$~?v& zBculZYvEUB93|Ab5P&d!n9Ii^K@l`3jbF=9SS@SLuF)6D6|K?+w?4#|M2gZ*w2CCOs;Hh*duwm$vz5+*%YVbgQ@ zaegbXzpJq34rZ=yF6PGe|Li%M*ub(dbHFme{BU>T*%tu3wGVA)u>U>QXn?HpZHos3O^ zB4Xwq)~4nvlE8XJtleCd%w0qs?VTJQ%z+mXC@NuX2P_mQCaNZ8Zt7?T%pqs)U|fC2y@zz^W>IzSWv4G9Sa2>}fS1qA~G4GWKs01pQT zkAsSagie4rmmr>rLALXW^Q3=Wo_f? z=I-I?MF ze{g|-dIDE46mSRHWj z{~560|0iVsAF%%)*BSsG3RfV0ZbirsN(v-!7qfOLW|UyhAh z_YbhN*?m=$m;?^qxQSxvStCK35#5(}rjIja|LmV1XLia>jedeOeapw2UXjcLKI$pG zhIh6UXNt`>Cm}D{nvbOzWm&)9bO&UNZvI4j8K$BWx%t4ZB#KzH>M=6rcJKp9kI^O; zj1{ufkmjp%9Ciq^;!dTC>)ywuEZ%#c$7(Zrm}HNz7)EIO z@^{8fLMFH1d-9u&T@3f=CFKMcY1nu7lBg}6BHgGBobkDpQQ0r(^iBG@h|sJkmbh*3yb%G%~yFX54E)XjsjO%h{wa;LHu)s<*Bl{!PbzQo|BUzb5ZoF zKK;@q&*uf?%VW4=jk}NBYIsTVwzvNFYZ76IB_2lxXd!xE!6J^If63Uv54t^?BmPjM zUI^PaWWop1N$#&*tABGGMS2ULBVYVpE`*K({W`;m%ZQSKtai3&P?#gEkN zqhGv;{PVD4`b4P=B)+FW#QeABc!^6dt!zGH? 
z=%<}zXmWt)wc5dNwo42CZS#n9d?qK9eSzfk1wJT+=4CoMhD6@n9P`yyK9F;5)KlE% z2DtU8S#nx{7W3G1t?QycLC2VUdVyEB$1Vhi-#i*-P-cCp~>?!*+0V7-aaFOz`8+wz%Pmm2>fA)dz*b9#_L?VfUY7LoN z9G?f`qfkrT;z*Nm3e?(mSiXhF?R?UegRGbPr9_2O^h1iOl>%d+PlQ*z2iaYTH*s{Jvt+z4t|xi2 z^r+)Rvh>48MSILJt0+4FJ`)Ld9%L-qdSANo-TBwB}W-mkzC z?%5Mx6>5t}=Y-vl22PwrVim3Fvw~U4Ii7Xe$Eh4g;-J~&vH2mMXB;j) z!-wfo+O=XctI;KmY`X5LI`!wg>^}Sqa6M9v&s#OOGL>Yu#FQlkKOV+wU$&9JDau`9I1(`fKS@Ex*?& zOdY^R_}^5cUa_rSy*XYYl})>2bgjX=@D>tv&%Jfvl=nC-_^ zV^i+2`jHT0Y)7TNX^s%M`Sx?NX5l^+cMT=Hk_^6vTjIte%-Yj0W3YF8lP)C&yQlsF zpqlr`%g4*L_?r?paL~MPujZm z%9d0!@9XwIKUfG}jZshUC*cfrumc0{xrIow4^KqaeURNecDJtXWZsg20lf3}?=fa< zwjsWAmz=aF5okP7X|ggWBuB^k(bf|3HGc8)+y?C6l+5n)!`_i${3#xk@F|K zpf%(52#7fL5$xAR@E=&vl6NupMn|p0O(FbatMr-swcd(*tKZEEB-h8irsp%<&B7Mr zIIP}-BEdwh5R_SHy48sB;~7MFzQzmHgm8UbXmGgj=m}3CT76fFL`|^uxZuV75%O{N z3#c>s5aNnbtusDR5A_v{VjtY?g*P}`55e=gtJuVT!Z>95{=MBC{>`rv$yzHwMuFBN zLgrIW^Mz>7YcbHSh;TmPlNR@U*Yl@6W16BbGX>Xk#6t$i(@xzUdcUH4?7&4k(gOc`c{6B`cSUroJme6|3xcU(zw{FDU6 z#|oGmU}Q#)>a^dEPOQ&rLOmX^ps?^v>ic=;>`U{RkDKbMJJ*xNuDRovvs4P+I~u!U#!7-fK1cuEg|}Vza(y z8C?5KD6_f~(+EK2le0ToT?s74Pz!Qu?3xB{n@oqE4v$jy?rfMFMot=OtW$9q7e~V79P1@6&5$kJ(G& z`iHHD9|qK^q4c)kwxG@{aGcpYqWLu}c|sL8g!-aj5=m7p7aU2I2MSyvYLfcm8i!kc z?YjU;JN8T3JFY(1OZthPAPZ4bGL93z9bY>+%jhEVW~Rw(;WnbLF7alC9@%y%KYuK~Z8q54%W`A3U-b zdwRTOKd`Xux{v`rZC!^52@}_5sP_XaalnO+DKrGObu!eHtp!Tv3aN+3-M9fPwh{{x zU0p(R#K$EHtUBYna4SW^EFYubWnb!x)}UK>31)u*?b$+WiQkKRa++iG#M9c#=OkeQ z_N<&VI|x31fZG0q9Z_KYwSTVNe1RJ+obRO%R*5K#gI&wIWN+Nk_BqzaJ@KY2i86Ds zR4`=BpLwJx?3S(F^h~3M?(lM`SjK+wg>vh;Mz;^-wpf68%~vN+I-~8Ythur1c(Xr= zmOsSgV|8hNu3l>?rPKeD@i3iQwCqgN32A3`JH}qbZSd=DnR0hi?hJ3YO)1Tp$~10I z_4oU-Od>iqC21zf$%Pye*A;c5Zc$_L0CJl0gToX*-lD4KVGMO9`|g1{*`L5tr#b>_ zxof*OTe`_b%$+zT-$?K5VnCD`_Gx>goLSf{U;`&9#f>7Z-F)NVB1`t9zW~_T zfrGjL^vd%f`QeM)H}#yJ`KnegmAi ztCN=8NXysXfBI!Mks$CNL34402Awb-KwmV46g!Jnn3x^KRcC$P zuC$f{j}LD4J#5C?kcIXPb9FjfU~yWiA2eez;dn0uEsHA;gn;aRt)Q@WNz*qt_ z7u`?CP_N5TxsgxM$Uh107u`sEk~23QNWDtbQaP~A-Qc00p+dB)ckMA|XA4i`z8-%C ze8^+A=~u^PAHDQCBq32MSjG`(*NI~ipgIJ#o6?P_>n+@u8Sf<6cR-9%PA7b9rnzy|LJ`hFEIp{w|AZzH3=pUnYr8FF6Ul8BzCf#BevtG6WU>`ASWhI7PDHxgH#WZk!z1+=fn~R6#~=!sc?UzPTgF zBpoinjUSpdx-T}eP?56y;@}B4d}A=+ZvB8iKvwjKjQKadMzs3m9m~2aYV~X!g{O@Q zw+aak`xEc%(!On^Yb1A9PY#|kr@$|f4qL-e1B4bO$QZQS=!xOkGuwvPc_r>l(12B& z4EkI$hPEqFi^5D=TU;uEQY62d$^k5A3?4D(Uq)=&B7!vcv*_P38k!n2@1;9a2IH_sZ*Pz~gd z#)&=jqMg@^jcN7QP3`S!w3fcTBL(YOLXE?{aVzuJ?H=5;{&rJ6XQW}ybs>(4 zSt3m3OV3tiSwa0lE8WHH4>LFtz!R*9WMkze=2)m~jE;+qjX%ys{s;|S3kMAl46v1j!= zPKjG^qQ|x)f!;w+B*X%Qti^V-hfkl+qr*(5UEIP$ARHTG?@b|tBj1&581u6Y{@|k^ ztft@KGI}1@Jxtmtnyok%grHvMiKDLhOjlnrdysn+!?-Enq8(u>?RHm_P{%a-ZK3IW zUADXdA1m5ex5XCrOrv9Ej(jsJ;3iHRjuMiCj+2GyOQH@>!Q1Vk$Q&CLW^=4&h^9l)rE`4`C4x9<(8harA!;5F$8g+@~t=<3X5i9jjor`3>uXHOA3Ci?ylh%codiJ4u z=Zw>{E>VNSLC~^NBw(&fVn0wa!vV?BRL$_oZ!sAK>nAZ_lT4 zl_%x7*<*eDDZR4W@g7jG6xKt7(|O>-vl>vW>KL@eBH0%?99{+_r#RQ_EGcCAVpT#GGoyhPRw{T@%f}(Yy}yJ2I&JWv<;8HTP;G6 zo;V~=Fjiy}37B`ln^W^o!R2I&=j4?CS`w5&#__zqYkymw_GgXM?lLa3i^)2|Su5HP zGL$gvz1n!k-92dSX>4ojte*=jNa@?3X6Yq&qk~@Q3Nz_ z=muacrJ{yymF}CuCQ-b>uoIXt&WGf~P8gz(%Skl|j|g3vdX6h*ZZ!yjbClr;Bo4YG zkFL7ir>l8XDEL7}NshSJ4cE`1qPUx7o2nsbxd>Xz zkN&uSTVp$b+2QwiKJ4p}I?uF@%?J7PalXoCh%{(iC~WNLi$0GtR8=1m1~_Ibwnd#w^sgo17o*X3gB~U83%VdIH&8oOHcp zg02^=da>nh4mCJqG_Df~!L$`#iBJ7k(h-xqoYz<+joXX3#pZb{@+F(wXK6Fl&583{ zsJPPDZWfe*A*Fmdo3u5-G1vAA@S1}3wQe&Czhad`z{aZSQ-7B%hr*#mUy&z$`O#|1 zcPu<~9<+T;X&))yjgruT%$B<9(-+Pv*_NLQBD$aJGm5p^yExM9r0RO4wB*w5$NHLS z)JF7BS+~%j4qWOO>_Xn_WBu?aeX$+7M9eC;j@Q}PRbl3{QJ&#(5m3Fn)~~;D42xBK 
zg3KBpAMOZwgGCkVEUn<<6uqUW-@}>M-X6DhKva>co@5OXry3mR?N9i1ZZ1Zn6f9a+g_Z!uHy4dmh15(71vp`63VXAN~O8Q95j zz1n|Ng$eQHA;HTV#fA0|NlL24+n+$I2;xUd=ZDC)C8F>)Hy1eY-u~b^VU`$_=FD&W ztmG+!AR9Xmww9=;^iutLW8d#ci|Uf~b-P>B?=M-id{#~eubEt_rvw91<$W>T+ChDZ zJT-uPo}+x?CKN_%Ts#|B_IgR~EZ$be-n8xvbm0h)yfm;*p0fM}bc5C2u^S6ey+jH= zW@=sr4hm>eP8THPyT{8 z#Z%VV9z)zFehjyI8=5FmtQZ4ze*q?X zKpks3-(b;vz_xatUJTy)FvG4^@ZPSE-1bk%97YDH+w!++&_j!vUD-Abp`LViF+mY= zDaP$DZ-065;fZMhYS@rPb&xW|+qO5O`4|y~_vN#asL%{2iNbuc6KjvUFGe8Y=ZWG+ zv-0B}5Kq?n`ZyeQFb?0(KtZoSxYi>N;9rAcH5Nuj!FnGN!+y?mq)6?__VXVu4r{o8 z1R*Vl(d*88bZ^PtGYDSqQ6ElRfMd#8;nyoVcKEa8p={e;ns*-L#i--4#J1^crXkbZ zkrML{rpw$HEFiM1_V63d9dY?12^OV>{&LS z*1eMCiR{$?TrV>=a|D2d0qbS}voQ2#&~qt9^dsO?oNawT!UMqX94SI=gDa3#F4^!j z!y6QvNbGp5N6~zfCDbmh)2DwrFZ~=-{;Jrsp#T0E%Xi%_Ciu8aw3LOvl*qBt3~qYT z2qu1k*(-X0gy|)#bF)Ql$x(DU?K(*F%>!$)(C z-jNyHFE51;`3#ypg)-Ew)a0`0)NcAYeT_8fC6cg=bj?%1PuI#m-3OE9PcD**00ugk?{=?1mn z{me~mqjaugIB?E&)Av2wnw*F5u)6)c9L1;z?_5h+03&GA{ig@A^ zB$*3#!{CNd`zL$c*ZIxt?8Gys;fTu=enHsW-Rr^5S@3h#txfLSDfy_w{63s%tM&Rg z7MBhU9)yj!yf1Z{%<|rZT;Y!jai5{zE!dGht$csX1AJ0+f-r@TN+f4>DyoO&YPdiH z>B7eMvcPg1XB0hc#Ms7kKS5@nGl1N=@P5L*Za^x*qv%@vz>$k0Pt_3Jx2!#^H%$RZ zZhjkWsiuyeybRDg!=a;OTb(fP?DT-<5H+ojAywB*QAUnhkI^Ho+|LuB7D%R&F8H-Y z+SWYj!yY!;Ke^BAsB$Y+SJP^-5gzC(yMU3<|5AVTP;IEVh`Ths|1$|21wgL%;Jl$( zptWJ;(~=GG{6XfU3Zq8!@@zoUeOX=8S)vVBMoNKS9v%H~<*G@a7mKX{t*nl2tSkYj za5;%k?x|`vFMRR2+F^OIR1hiUP2@xNtJjA&G+yYp^|jh_VNMz6dZ>EWGT1`?37Rx+ zsx3ggn4}n3a<@=7etZ>KMzI-G)XY=cv31h^ot#vo4kF#iUH>FbdPDZT?`jHp1T*|#) zO_5t_Wpe*6R8^I)m^>dyp!>NMPS~6+16vbars?)54!26THta~adKCI)Op4kd;V16u z`Bps?-vF23>Dae*=1Fw{!+sjU`2;lEO;Gp0sF7(s{QafwAw^&MgA5@r8Uq%#bRN=H?t~lnZ%*skT4{WcP$e`; zOpaLq!XKvH@UiVsB;hv+Au~`11sD%k>O0YZ6d=@GCqns-W(fL3s>nRC z2u=cAw+@xNv?~TcZmF$HSjrrkDi=n6NxhJTwf5~8LYl<~ARmc9l3QIrUB7|WMmT-2 z$o@j~Q5WI)uuk=7MA%<|T6??d#K;|{lAjbiYtNWdq|SJ*=<0XFMn3_p-XQu~FcYb+ zDe6ZWMP>c1pgnrtp-dNp(`PtRrb#LznOsprbiGM|v5XaTvO};^jHT~9OI?zXb#I#C ztNp#^sH9Y6fJgGrx#)WtzDdopqr3+l*7Non)5s<1NzjCH@+V+;LY+3_Lf)|eUyt_{ zBGL7`VYiOjZ2N*W>m;6xQlr|Y8)O&T@Y`?WZN0K?#}?=yXEv+M8k*d-4eV1j(zLj_ zz)_{J`nCPTxs~h&6?C_33$3nF=C(q!XUdS%P-jMr*LB1P(6!6C86Uu)fr-yQ>(wKi5OJ`uaP_HQ+0ank*jg z3aQ`ub@|ta#UDKRTmI(evY*ljbqwng{ubm&WByH7;_K#M#q0HWmd?3GHk@a~YW9TVAVqe`I>ylsW&V6XJo^in()b~W^4%2bV}vZ-N7g3% z5xXY&#dx#s33K{^w9&Ck&=S8Bcb=TT&n`fRE=bMH=|ksMObo_&{`hU^^drVv_%=&L z&aY6oDZuM0BOr4L=H{>}nxime;UI~-{P4xMTpA{EjT>sRjXdhPJBZ-SoT_>=teA4) zhNm+i2GdS*%?)p5)FaaB;FE>6;pRDp%@8$-AapP)6fbkjjGCA*kFfC6kY^gMpKO?h zMH&yo%{jQpgFRr-Q)SgFNgsLebtB=;B)d%UNuK3y2Om4%lSm0@d#t;#g9&8c-7ykJr91%SDVj*z@2>gF5t)cv z=mCEGNHrIj$6d%CQ%%zHOf(f}3L-%mDxbe|j(fUS&K5aT#7fMqK~PSZWe=%*DU|*_ zru+U!8hR`Sy>tJB8kefXJw&4mENbvC09432pyhhG=$7*j?R4+1<%=CYPR&=Zg^klo z#xC&poVL1rZub|kE96o*583OFB)kl!0Pb6ybZk0yaIk-qha1T+ z`GrZ?jP(X=%h}6gq*U%JJJ?>LGMLgCoCho}A*;MW4A zk?KZ@{h)S^qa5P#wfxGe_WrfSb0a@wUX-K||Hhv;{;`zF_(f=7)I>;N;uC^z6A%x4 zAr9CR{OKbRWkxuE1b5yJ!Yqy-7|UBv0O1W;zi)4<+{Sw5d6(y4(kxmJcn13mShCHz zO9zeT@4rPlZef9o{8Gh_qo5>HG=LvZD7PcOE9?fESh_c zU!MDWK{5Kn9=u-cO(DU_jQ(EIOjuxJrd)2MP4@1(I1lLuk!KXzkh@{$Nd|bXH}4~u z7vrCyeq@*L<1`&Xugh&G1|akHAFRgsb5pHwppF8@?<>cTr4yHVuC!|-3Wcs{Vk-%X z`&uJnf+M}zZke^g4=0za1#Q_<{BRcS$2TeM_y%%eP9JA1eQu;h0w}`I zjzu5kr2}$ElVyUR_cBuqJe*1r-sn}6orh(Lp%j;8oQFa;-2*v};~!LTNymIQG=Jg^ z`K5`~QFbSLmOuOuXkPq3B_xUi`OAb|3)(wnb+sM7*^ygmEvMPwzS%TLr7@tIPl{9T zfP5}JJzqXlcl#WP&nsl6`J3Hw>>oLij~e+BIXoVD%56DEQP7sn!V!<0U6vxM*Bq(D zzkiAscti}6SN5wa%sDtV^CmA8gx1LYNMFRqp7lUj@d@dF`y;C>{g`!PXB3g?@hX!0 zYQY;H(7}(S@aLj~A;NsOn%kql; z_X^83CZ15ZPkD36y-5+Xq-W;gPHG&vlqZ7o_IA%UdbAE<9DI^5#_}3=J{X9lh6Vl& zMF(A1QD{D2%eqQuTN6gVv4CAcya92$duA>rNY)qi`@J~iq@R53T*RQVl>wpdm_8C| 
zB~pY-BhW=)cuUNk_tnDAcfqqkkLTjWe?s5MLR)SXTr3L zt^s+%a2N%5%q)27tnYeA=GQ2DkL!8X|eW(Fm^hf!@ppzjy!KcPybj zd;$={eAu?Vc<#M+F;)+|;>W55CywUeAOZZLAj?mhSY%b+I<)dOFV3t;!Wkv`k)oy6D{dV$4kc~3YX!e(V-;_ zxW!V@4z{jqw?M>ILIV3ubgUN_%0_XE6!ox4B8i1o(Gw6VJc;%KCbw8ql4=_jmC)k- zs}p1;Y2gOrG#9{T$g8{PzN!ghiyTHFxb5GY81nktWHw&~=z_x)TxA7X5eonqX^!rIsgwTJPbXC4n zTUQkU)b8;FkLF?IQh7^rlRrI_61a=3Y}`V{)VjBYwn1*;XYb-=2=x3B6>Uab^wO{nRYKj)C^Ms~fBgY~`+@UFoO`O77pL7DJ1f_Wm4;8AnEF4kr8{jaqN68 zuiy4snOTCNsN0z}&>7X75~CQ2BElz$u_4=BLkg(^1%EygK&Juqqq=8x?~?}K3@@4T z1Tmmkw2Ijy)DtaGk)i`B3^OMnrutX(Pt!!v%Mrn9dj$p*>CTm~#bUQO>*aAj+A$)Z zqdjb>Hfv46mn&O07h5Rh&#Ur+^{02ynQf^($IpzVR+I)rVS@H+4GLkG6;&-w`&9O3 zj|~lk@yTY=gj{FRVYG?A0B7|2H;~L%Pn{(O_imNFry~gpO**WG=o_eALn*HcSNmOk z10yGuc{!pN7^0;fy^@HsCt=z4aG-UJwa5X`dVVZHnt41aCwHdP7H+hub8&?kYzpTC zT@y9rnemZmsaxl{e;RJ)M&>Gwm=_dMEuIB}8dp{R!r8j=JPo2Wj2jnKfg_(!MKFTh z3NXo%uc+g|2UKh-Egm%78!=px*H?hGJ#6q-a%l5Gv7fEPp=~fbCQpBB zW47F^gA(YbWw7)^ZtuW{eOcBN5%l_jI7Aqr6Yq7b`o$LdRVlk41GzBxwkK@rNwC>; zLw8E6$F=DH3S=OGhP#Z(A{``bF!=Df^P_u4}*^wxFcoHu+ zS94@Y?HXt3vwQX#R_a?WG=x zp6Db3M*jTd4+3AR4b9c4Tg<+6jERUwz9hZEe_Hcmql{sQ@_(CO4Sxref85zxZ0#** z2(xJ&dhl=^5Q%=Vac(&4Y_htuU!%d>EwQb37qtSdbl_&O&X*9tTkB8nM+km$?1n%> zugv%o+NS+!Vz#z>Uec5RSL}vsCkiI~3lP;xZCyq)?8k599hnD+;>UV@HDOH6$~N4# zwENnmF`sIvkR8J^$h01aYb#=%^@nHeE^`Uq&79)BE+^}rVPU4ui=(A*Kc1dPx}0}J zKI952QPuIpxC_K@kG&xsDrgQ{WJMsyJS{MSz4(<+`Vrka8dhWu9Ih;BmaBiB(=!?i z-u(%{ZAvP1a~F)k!iYNCdZej`s#cW-xmK+_P=&)Z&zN5m!H-LRekzZ?lQcy}96Tfd zYUFq8ttl5UOm?5v3*uy~G+3Hx$3IdbH3VZ!;mV~1xhs9LzD_=E$pw&fQYYxM%KK*S z<==rLpKW&S+-OU->8%Gngm*YqPi*gISzUL2uom6B%JHJLkhEKjc)|;)$6B`Dxq}=} zuPZZk)5P|%D%}{+I}MuoD_AE^23L)C?W_*|U#A@DuYBSr+czX%>_L?ZORaIn2nVdW zv>uPVqG)rHy*l^riU`axTiRmA%=Zw(g1Q;i`>9sn`yJEgSrnX`+{T`(Plq{uQP(9b zIxBU@T}<0d>REO;<~=e;Q?0JmO)|9{(E;^iM0S%WApHK}>>gKA*UC{+zngM-WGZl3 zoF_USp*H;01NwUiShoYLHpRh2K|be16Ug4ah-NxaJL`WH+bdKK2oOezHgBx@w%3^_ z1Wv?`zj!{^#22vs1ymX4mzip4*SQ9=P&Za^&C>Y!(W-UgpemZ91?s!Ta+MV$ua`@q zX#NS;QEY0UOpcz~-tuqGrJsvfr4-2z0|QCO1edXtj|KznGmF00em=4mx)aTS0TIb5 z{jF}h3hy#X!wiTiB^b7T(Uk01=Fu*_+Wp=JmSJf^jsR#vUb+`>f$i z#pa4W>2;K|b0F;VN9B=aL^bBR<^%tVaw&gyw<6JU4>Cp~`|qoROid$j);ty0ZKoa6 zQ}y+okyeP1t~BJVBF6+$4R_L52xo{u01g&dG;WZPM1-n^a zuTWecv}ZUDHIg(%gsV7^i3G|SEwaeq{_4N$=M`qQohRwO{aHTxoE4+%{>%7MBqExSI;N1ID zvXD%WhLXJwrQC%mxm@fde8WLCkjQ#wTaqHqwTb^oyi2dBx4Onyi1d62^sejsX&+vX6JQMw>la=rNa;yB5P zfPYHob?DBOr_FehY-Jb#X9qsr>xvaU^q#GSXsEpkYdTyOC9p@{QaKeAw@SHHL5j$= z)zeiuSbT|g<}~DA32FX?LQ@twKn?+F6V0ACbS#` zch5{?ntUE2v39|CjkQ< zdZCFy0Z?#%I_>@tTJF`g8?1303V{2wfAy<@znSiJDW-PGEsXs*{*~(=2(7kgSJy;; zt+)ses8#;}>(XYiJ5^sh#ZlyBTiCOG42}o#jMmn8HIJqZ`qr&6{sx8sf89;T^^&zo zUBu^{V;Q6}o!fP+InG$tz2gD6_pE7KlR;5AABdBAJ_FU8KYAdan*m=oSI~bI=JNap zbff-SO%neA?n8OO7$@Xx8Rq6n^E;`$PKI(ar~!qa73v(!O(yW)}C6 zNAs_u=5V`*pv4b!dRsH+TbX2eO1rx(WU7W@LX7ktm0qTmOWZh7l33sY?^i8s)5>Y? 
zORB2?02%!K1v3<`M{B0sT>!uYkNfYy{5@*?x?DuZy5t?I0ReHidm5T+h1@=F6+mH( zV0I(gv*fxvfX^y_It%4Uu-Z&2U6NhGl#^2^J>}j#SCe)o8A_E_dOB z5~tp!g5z@J@s?vi?1-Am(VJ?AR`e#In@(T}i!29j^`!eHUT_bsLdDg7_sp~jq+>F& zoDfIUVwqcpNab9<-1h`z8iZQKDIsA1dWOXW@|jiXJ_54Rue1x+nH_JTW&zQ$xp9@q z_BpOFX>lmh5eEr{>x_XSQiIeSxNtQtME=y^uA{!^&5n zW5D_jmFlv1Pfd~+x%)B_Q_YO1+k!?%O7Z^n@7hZXpATuSdGjqI&;9TPCXFfUdz^lw z;JZeKS*;@~C5*TPlY&Nnoq3p6V}K58-|Un&oo$}cg*o=)`d5q3WRYt(mb1H}N~?b=-Ld$K;mt&aTeub5fF@0c^{<_1rO1ir zXVr1dWpIJTyfP{1$Uz2_+v%_Ot#+Zgyp~a%qlRDvHrk}dgC()_HPJn-_)Ma1ys<3~0H}gE zl~}V!+>XpWeGO|$lbgHI66BV{C)9q$b|Oh47LK@Ql}IYs{w@du-nhL+Ynb&J;JA$l zWjlBUWJBC`;PfK8ElNiLqK;jMzEChe#2V%ACA;#Cw2Z8o{o5*ZZ>MbgS8`TXMorwx z({!0NO<^r`>xom$ZPFG8DIaos99OKRxYBH*OL#*^7yxIF#}z}v`doSjk>=eX7n3My zBOOz?T>1*zE-&p*mg)-(dkVCyV%^yUH|ueA%K%8m0qIV<&j7aE=CUp={>L)NvJJwv z;kb%>Wp`!9IW5L2_9V%GqTG}Z_lN6Flnzncka}mDvv9UfL&oH=IRz@Tfuus+!S71o zkx1#+8l~KJ7Ky%jNLaw>pHu7wdp?DtTljlRj?w^G+XiU>=12AVdV5wsf&4yn_}@{z zge}~P79qPOV;~&ku{GD1=dLTt#n$JSDtaTfg*z*?sM!jnZ12cXk^dG6rLJ!m=VD|o-@?U0N&_(Z=5`#IXi;9S+_zj1$kjeT5v85a6cduKI3#^-R`_ zlZP&gOYCO&!&DlFi7t}PC63Yt`#rpBepHQqQT&Onapq)ZW?jL06WX?Q9WLQz^R(bD zm5FAJSQ!xg4hs)VS21%tJI4S762pGsf(Shi(z)v4>QQ{NRxTCcRdYeyu`3IS2Ly!~ z0QahsKIRfb! zeLqmvFJraS?IydpCjvN-xyQet_cWcFzi?Mh&!O&f-8_;^{$WEg8IWKWKAGmA);=EVm(abkM*&A4d`d8t5qHEh-*Di1R{}(RImqOk^{-uOb3FshR&w$J&q2Tg zprs}R8GYyT;L)g|8NdRV@6;26Y0iX`- zozBdAl55_69a?>l!8QXJGVhfBEPtS{230IZ4oTx4we%FubUy_tSj(GxL|uwGV}J-B zolKpT&7iLAB5F@+mq`TH{(RR`$lJ2RX*gVCz5uH6>bI87zUEjYi*^*llHZ6m$3KX4 z`+%u^q`kg)Rxvb9_#QD<((=Yt^B|s8euOZ{73R*Yo4Y-FbYP=nsT$&3bCxl%7&xX) zI&LfGAU$wc16CX0y%ZTvAe4~NinK7dzRxxp(pa-Ez)d$^--y37GS)q7EmoJ%4{g!6dEa55>0c_58i zHrUEOU56O<#W-G>6Ze5gJx6K?86g7#60sTjsv)A`kimoya;N(>9+StvS{H7%aCz`amejB?1l4F3QbKi0h4 zQ1Hx}=fq3hJ|^2F_;~ZiNl<^GuUdglk7DfX4aDqTL!LJg(0Tz}H;6Rb+kX&8sVoz$ zGeU^7MmZ7&JPdLU4P>g_I~vucWRpDm?0RC5p~2^=YDdxS=Tg?PXYtK#x5IxC$7uI6 zCA?88?mL{E55QI>)|0Afkqd&Gp#K1#@Gx}zantdw8hshcn(B7L9w{$u%9sF{^|n#M}b#<6!-C!ObL*X+)# zHsC`yL0f0UH(F(jPc_*Q#<^m{APVM^;acHWKJ^!h46^uM^&v)QDfd)l;+Hj+g+*RT z5?%ab)BZ(@;LJ%oiGbi$Gx2`F=l7|f`rvEGm16RVE)|C)9x95I-}lBSQ*P&5!bMwM z&u2duZ1~#i{{VhXH~tpwvpS+S=3|9bgpyCCc?xW13Jys)z~?pT+6J9%rdnE;Hm`cB zPRRgS6^}Sv@J(q+Im*U(iZG=V?{?0L{{UH=Q@4gInGL?<${4;}k;ZUy`d1ICPjjZn z_smZj!nx#i;OEf#)!k2AkuDk;Y%URzvz8da`c{0J)xE+pA>oGEZKWJA^~d!U-xXzK zdF|biN#ffLLNxx>hDDb<8hC);8;-~O-;d{4G>Z{&ty@YNE-cjfzH*P0kR3*mSB%wnU|6V|MBR zHsZ_~Sac@2=zPlyri5o^az2&6^K`*J^`EIjhF9{Waf}j!IIR(rC379zBr=V_brlt| zAVP$OP2FpbO?W5hyhp{8FkJi00x92(BJr7#*aafysvH6{JX{{U+OvM%+TD|I0+>=zx8&+^4!=KAVtqK*58w}}ZY`qvY!c#bVL z@wPfEk&wkfBk5V1zltRC+w7@vaD8Klh5XwU<~pV2*0-ts=2ip_4t8Mjaw}RmX;*!b zol5GOJLmBoz2iI2cJlF-P~4I3eJhvMwS%fSSlwe-@*-6qD*FEb`s$~X3=uq5cB>Ce9Tb?__PruXhW9g%5jo`5t|IOsZ+n675?C@RJsPSi|FYJwdH0HK<$% zg^Y?g*rVoA=tXkZlPSz-;n1A@K9xyJ1_en3bj5oVGYXV6TBFOag@sm|=z1-OiLMcs z>^dtvzusU;HKTc>>OLjgaebu6r%8}a-r$A3nNQySb;H>BgTxR49kt}BqoHM1Bk?0O z)A(ddeXqcES){#|Rgnpu%K?;OfDZz?s^%2x#?poTOex_d30@j{T;*=G>xR|8vmP?D zs}=Sq)O%x^^a(6wv+#k6$pkUmTls|K64G^IewCA>U+K2jsQQWX9A{wzagU5+j@9d4 z9Pnw^Vv0-18Z9J^!Z!s>!_x;CcJd)Lz*9`MGI;cYxwX!dCYc=?;F$bDJ5lUjE0LumwXK@vw4 zY7r3^0YKmYI|`RSO5ETRoN_6LJ-GVRnfg*3ccjLdUTBhdnki#fje!h~poRyiAn-+b zKZ`yG=vvDew0S?V?s+~~P4KZk*~N&@3SBP8b_qtb$hMlNbn|qjKD)5x$PCH|_p&WBb&qmimp#A-%fxRu>Oh;#GS*i;U&;WJ>yccQ{{X`M4MV&sJY;lLkl=kQl2~i(&e-Cux;PuX zRs?~bYe@`;u4M>~?t9mlYElVxD~C{kGm-*y{43G?LEz&XGAxqq9WqUESAH6v6Z^K@ zbJ4i1Xw&v>wquB>>Lilb!s|s{zSG{tA}+y;=YiW8s#VZ&wA0iEM>MR5b#Lr(xGv+hzBFF^%T-`MF7~8QBFb5 zeJBwoFaY%HQ=C%8~noELP zu?D&?1ZzV{@e)YV1d2$?J2Ak)9Z$DPo#VfnSgqoXvMt{QpC5Go9+l_b57uNt+I+JWw{65W zGE@%3+nU$$EzAk0#i-id{i5O-S9}!<1*>Pr<0YH-& 
ze0tZhc*n##MyYLapm~n>+Y0~(%A^eQ?_Mmi6C|;)B|?@SnWX!TZWAd2Hs`En}i>oKAPIn83_V?&_7*RCCZd2eGW^?Ot&^TlNyUQUA2 z!G4ygKc!|*szm+G7L51vG*vma{6$SI)s4iUZxP8D1Iu;KJqA5%F7DRZBxjOrNICNZ zDxBA$PA)4(Whq~9o>Yn-BITvHw_X7!4;c1tzlS2Tf3YRhZlW4&%X1qxJjeN3G5*-? z_2_$238j+aJ5syeRtXf5o!g1*K8Cc zA{1L24!H{rvM>a7=B$lM*~iN$C)2G}lSQ|gFu`{ZYD+H)-d$WQb`Y0y*B`^~PioCp zU34weeFk4%-{1=f;$nCT$(Cd6E1>X?g0*Y=<-gbAdsty`%L{-SQaJ-1m)AXOvhcr! zmqLV3cv5TkPzwQ-jCwD>y>7k68|MTM?^hLkX7P^Q&FN5kvHRJ0+5jB@=}i{^oMQ*j z(}atR<^V!7B4`$%#PLeda=AN%I8 za%*;UD5tSAvXRLCwNhBE->fRf{n}S~@s%M*s{lWpW!`EL#gQ(s5dQ#jAQ`_PD=9nf zTvdnMMFJ_?YKnb!f`65Ay0z831espmA3(!d5Imw^N8g?D#*YV9-#*cKDjl^ z$9ZpJvLZMnu`J98W6z~#YWMG`YBF3imn7gZ?~e6;%TBn}=Z<^n+hM@Vbt*mkSJPoB zDp1>Ar-@plob=eGKC3&(Ol`Du81L^;t*zzb?<(z{tku~`ZDk1C8pby8r)V5hQbvja zE^vJXb*SB4vneYa{8}L#%Gvf6dM!RUz#9M_wR3tYI4n=SHJvu|xksoOtrSw&RuDju zjvYZAs{a6oM6-C8PmbC)nnssuB63L}_N>ty7cQxXX-fr9ElDg%z!Q6A*50GeekN^K#DCxTk`I;JO)^>3~(R)nZ5dYde^6B znViv_^fk{ZH>)d@esM{kaY>WwO=DAVNO5|MzDLSxHjn}V`AO-XhN6&VvKXWGJ&j9c zq)KI6?U72KUeu#^RA5J;s%>=faXe#h_k}7Hljed{SV<3!s zRZ}ajRmUQudxrp&0Fl_y%p8Ti+33Qk-su`{q$C%1R`Ro`R&yBMG1CpyW4ARD>JgK+ zbuqqv@E8>HIhp05JnOv;%-xK&s??jGkttQTU%YCGzQ7L%?WA5N`dGB5|Y?7o1 zvng3X`P7UM-N7A>YwEb~b_|xsVa<5W$HQ$(Zx-C!A!XCdzHi#(Im$84J=>_NqX@;e zx|3E?no3;_pM?5bLuGlXy5r7a3}p9JR0$^KD_)QecyPnfk13cH9#Dkbsv^%$LiL`6&Ji}N2c38jQ6c}7E~m8A#zVR8Rv@2 zqa;$cJ7*sB$OKZ%6m=&Q_KlQsMg=Xf&CFx{s****pQR$=lUFLyMHI*ic<2Q%Q-@jx ziC2M79lHu*u^l?pfQZKCId4iumJqB%KM6~R=VnOWSN`O&c+UVC%Y0aC+ktA5w7gFVL9cJ->aekp=&JVmIqZ3w~4 zo}#E*_>RX>oTatGyAR+UaqUxdW@Kx+J?5R`+ndXqX=07yc~#hajyB+6^sX?lOO8$p zN1XasrFf&ncNaQDIw`n!5A!f!35^H8TJxPsB9Xh0rC4D|sd8HlElfB2$Lm@!Yd3I3 z9BTgnB5(#eiq2&!*d!C}K_!nrPAgWER%9mAx;k5b5vu?sR#yK2WMox_@fEOQ32cvJ zT%x-u!ki4|uMINs>>lvPx+OVctP3uCk=GTBsKWY{?He|~;Ucxk$$$Iiw=|t@=le2a zWbG7J5^SG584-B{>73V`gM*&6*+nne?4gXJrqu+6>~YWYs)ZQE-3yxKkF+tKX+gt* zfZyFGy=BRLi2$GG%VY6je#yr<`%ErFGUuyMzKSi_CEyPeKL56lbvGqRn7Kx-mqUq7vLxv;f zl0rF=`;+}EL7c7>sbxHI&MSg#H*SYaZ@FGu7};4Ok>Pge%Gug}ha#hUX=8u^ncaTy zI8xr@x8YX8?p!Nw+sRXud!W5N#SA$pOCN~J+Wc1HQ z`WlH|B_|m}+*ch~yV<)PRN7jlbD5h_f+x$xJ+pUc_#cSrNHtq`P=B_i4W zZ)X4SR}7;lzUb|PjMmm;=9fq(Hy7gJk7aGysy8~dt(+z8 zZDWB40oiiL=sTJ};U2cPY`DoB;CHB{L{FH*xtH#YX9L=z{y^@G4MO)*)l{L9OIuU- zlpH&cayk!c;XGS=7M*4G8+T<@*b~8w_7x(E4XJ{Ln=qs^ELf~!#ZvZc9&eN=I z8ZDkn!!Y2fIoKrMND|n4b03 z{5tS;uZwk3{zDBm*n>Qn#_vQ2)c5oi4tiXCwm1F^_(E&>lf_!4vd3>3BE~jeUAh@Q zgP`~4wR?$d)C!T>!yHhe#}sN6RR#j6-~c@;UI+wqHOVP9?#oD0PB^BC6^;qPr5>WA zVz0v-9>$f%ks)FSJdahVowB5Zk?J~%WA9F&u=QS|qh=rn&f)#>!5>jl*wPyFyle9) zmNVUZQY6VLA1U`J?0T&sak%Zkd}Oa@9-^~vQPsZHXyqneasA=wY8z;@u3!670wa`V zTcPx*rn4E}3za9*s=;#xuWmphE0z1p(+Ae0j^=A;Fw6)5<3HA=wD&11Y);pga4A<* znMbe6!lO%@d%#RHJW`IK`h!*H)NQ=Nk~mf!k3;k|QfOeD`H)H#$8c#}b}!8&iZ;xF zhxdh3lJPCZ;7;Dlnt)tNidI)F1yGiBX)^o@mm-ohVZV~%O~a_@qqT2YP4<{^-A_YY zUCoWdAi1%Xw+=y5kUfa4Zw>1=&v=%%`gDdy`9$%3-}n&iS4vivq|IJB9Y>Q{nsf~& zp{!ivA+8b?`bK~s{CS$UCv>)gN#E{9rH{TlQ#9K)wy=sh2k%Z^U-#ISZ`7LXvCYgj zC>>Pp+pvxXInVjURlJ0u zCjGyA&v5hEOd)0^jBGw!`(vehHO7`tLMs;Q zPdIES%{8G-DQUZbj6T`jf1vOsPk z!v$fxetEz>v&iNb^>()GZ}a_H&~H#^}#lW39;q zJ-d(i0TtX$Z0S5CGXNz~!ReEa>seDpbgXeSfpPuwoX{>yY|dM1uilLQbdJbzu}H_M z&2CC>6!CL6{q$=x_F34fODnh>fXAgWB^wm!>NutW%|DauC?yZvtJANoIaIO9rmBKC z8TYFQyH|6NqJn@f;nxC~&y%$P&#f!FO4|@|>rx`e3gzE(dQdT9MpZ0ah3-k>qI<31 zDuc9lsZPs+3raQ)k*GXV{>>D)3xz(^fT)P_9yuMwT8=WzB9AA7xM#OYl2~E0QHCuk z=mm6s8Ss2M<&)gqn}tH#V=%B8$6VwZ7?N*elK#rO?H+!Wx#4{R*6#jMs|h29H3dd? 
[GIT binary patch payload elided: base85-encoded binary data, not human-readable]
z$0fq34WFP2hsuLVCEp@h8Vs|hRECy=@pE1f>{T8L29T{mP$!cA5e@%Hme!ypAr$Yo z$VNh%HLsT+rp-s9o&PAzwVBk*2Sd)0$~(oX!kOD(9vz0w3u^30jfoe|3$k)y-xI2& zm^EAX2!_>s=ccu3ll+u%2~R9dQJ3-cUJxrQ-$nrjGN>K}B1l~VFMuYt*iZWty&#xK z>rh`)NgWZ%IRfSdg8gg}Kz5jF9kPr|y?J!JiU5^5l43xN4J#fMjKZV@bNR>Stidso z4$kqf!|dGAacp^s!^$_;;H2dSdztyvy%&U9@7}%z(xpV5W!L0AYUXrH<%J$C7lMA_ z%K9eFU*(l!o_zEG*NwtvrbWJ__lMEi-RI;g`dO6quRxbA=)P;GLug;`^~gc{>g`N^ zM!~?X|7n@^&(p>Kt^C>lYl{9a`JZ{hxoOV-I$u9Hq*YI}8jTrhAi zZdQ`y{+ymDkR8Jid*1oLS!0acl>SoxkZOB=x4`w%zPMsp-oCIR#*`g|$jd1qZ19`k zBfoaY3`kw)g>zZo9{PaWwmr?wFANif6z4hLF!g&2iS3KEMPGFGI4bO3H5R9-}+93$q7M?*ir2!#{Hy`*>$*ubW-J%49IT)p& zXzX|QH0i4q7CpF;**0I8de9k}3St=}VN7&F&0!ZlSL|oeVUtMQ?-{Iq_cCUw1lWwJ zd+}yxY-B3#v~_ybmg+?tNOqET6q zB8TsvpEX*HSZ*b}JwH3w8x89TPk&dFicjgtOJpZwe@Pp@6rN+PAzkV0K6mIpjibht zEm?j2ypM)LdC2(Xv{Y5GS~y%)+@TL|fVs zyO>lOXxS3Vv=+xKF@TPap7`TX)N`pr*!|nu!yASRQhZ#icbruCdG``2E@N|n$%{Ob zvE*FyS8A?F^&y)spz0BhiXfide!{(wx!7K+buc%g zf+YRdsie0Q3Qn%$y-3oo!+^oT3_?=PbR3-Jr#x4Q98@A1 z{AN=-^n(XS0Yl92Elu+Llc{LwR6#sIb|kknVVdjq`}av9;V$!o^lSVkh~Xgk<0`r` zgw!A$&>B6Nhnzja$ay7)%&>j=7o>&Np47?dc&X>A*KZv&XvVt|kw(_!NdN&(?8JJ+ z%VF%8vQvT@T93>yJ@2=^A3FDHYJg6X$frEi`ivif94MLtSi(N{v_cA|*Xf6|IW1^semq6JrziSu)f&a>|UP>q(@zQIwnz#-+e(tsuU@ z@(G!xWV%d{xlTUIJ83|nK_}ylRZr*HNg%FCDeo1m-DHpw2ea>@j6d>PVa|CJIG&~YQA)H%JjCi`eI4mJENsYdbg6KoQF=Hmu!IiDrGbl zx)v#L!uB%IxbwZ{j2j)Dog=A-YH}5bQ)+9sl}AsdJre?7U55EO(Qo8t zfrs^y&v^j$7=9L@7LODTXsG5?0^~##K$?F8lk45;=d=Qxr^PIz8{VW1CL_#lLo@cB02P%GP+QuF$eq-83~=Z>)s0WBghpgJ*W)Ddf?`Nqk0U2q#Oh(!ngc#~b2#IM36fI89Bb!*8~0xZ^E$C)ypQo0GF~lOCAASjJe{ z*dsZ9;%2(fLu2#3Sj8KLd`UAJRNm*yRe(GmGGdW9jfj{rhzk&Y0D&@ems8xqG{EEw z^`2S9gNfa8oH*NZgchJX_@X(!PTy>RLDa-a-_5ZUx^~Lg31cliu~}EVw$R@u^05v$ zo%<|ZHp0x$3n#W&#{r$DHRm{GG!R6n8lg|Wx)6D76VJkVwkbQ3vnNkt>VH4_>EGK6Ylm1BxIY< z_?pZEPrEwBBz8D5WtnjFE7sz1u#exvRq-GsE1qhKz}0fuMKuA$7?F1JD`)dHlsptw z;Y=?(Q>n7@Ij@&o4Y(6iVAzzNtbrbX(%U`<&t+L30h-~VGtSr4?j)TAbd%GWstKuT z!EO@Y$W!LN#-D3eoA|Vazwd@M6)R2JdfK69H%OQcv_|k}F&jox6O@Y6T6ynx-;!;= zW#lW+G$Es*CVKh=459<f zNaTGR!GjQ!!jgt9qD#v2gcNjEq#nIhJc$#iueu+@P%i8a+NtzzJ<`hEW|U+Ad!eI<)wpi8Q_YJI(kIqBdz1@&hrG< zJ%`GibqBFm#TTiA6m$1$rXt5Kzp*23{NZ=dwTjay1l(4P{B8jk5&HE;3XS}XB>nnM zOwk7I5|qldZgw55S7ux5c|+2$rWk2+7gQ-iwy;0ELc|oW0&RFUtz9y+53t$Kif4=h z=r+fZb;rqFC!vX!4)}EX#l);0d#RnyF~6nvL3T}+4PTY2S*m5A|9fm&VueW66gg@7 z1TI6E+3I&JO|2kq?S!Q@b1o`=b@Xyt<5gK0mfvSCSf{aNoR);)q?Ju`$iAf`tCbeZmXqbECJ_h* zJbKBvTfe0lk*dQch5aQnI3LTB$h~;eiOJcj8VWw z6%|vWl;{0YM%!gUn%^gt(o3g8d6zJ-aghzpGq*#iYP(t@b*OQtOpoZPHL&7HvUvk~ zGZtxEkj=O;ul(JNddke2XI7kUWo`z^>fViHII&X^r?Adg&9bV(SZL5o^H@caT4>Ms z8|72f2wQ0F+KtNARe@QN`&hw9giGBd%WOod6e?)O86<}l(kjIX(csz3uTnRhr zuc5i5tbwtZvZKMMP-1Gtm{`h_q30+il~r6N4p)YX6;+;9#W00J1*Ttj7vboryUOp= zQeEm%VR{-eJ&iqJv$VeE_M$p^-im=U&oclSNEgV+Kb3c?3CmjDE*a+xJQB)`66N9aPo4>1-&W}Yv@tN&h5!gx4DW_n;2J~w)1xVdLC`&3^Sr1cG54NW|E zCjt&mceVEE)j3_U&2o2OJk=HlGcA6BQ>A!gx~QeV=dCtz?csF8ck+BeKuso{CXvha zXg_2;zrURvm_+%4x}6piT_s|vjNFd6coMZ#6>GZ`{;C1K01}TZYtTidzFa|kYey2- zf|@G($_uY!RRCTbx;sB~@MzBF4Jq}*>k}WU64B`;L)yvA!4WdXFR3zp?+Pv(M5N{# zFg6x6k2b=Fc>;OH^rjQDE4MxW=&0B!tr+7Ilf`LT@7qSolOf~m&v!}sq|C*z=-5ss z+JbJC;?Tm+IpPY4&K4EmG8`LXNT9k($c3Z=;B1oCaYgn1G^l61Fd@IrU+%s(q^BO% zm&_p-3sGTl-o;=>J(_OHGh57q%|h18W+z@jw4h{L7elzXdI;jt@7qV{!C}{tm8QUc zqRyN$g16(Vf#3@v`C1H9#(AG}+;6BvH9rbYU#hJ1{gAo`#_jox89gaz+*HweM7FZ8 zgp-87Y;n)5%1qWop@9tE?YA9;t}Xvsp`u7m@xxZwd`G=_JbG&IkO9>X@exhAl+mo9g?@2~Q zO+-X(o;-L^bHAl!Uqf+nx|AoKPOXsK+5ZxkZ`d^%Awmmj0=3OPkT|1I3ZpM5%L>VD z6|2}_UbLRWn$WgLygPZhf`O^pQP*U!U0bF-^KgBITU(pn)^m1Pq1ATl-21wtkbjQn zlzX=+``qIKwf*|MaJ{wdMYdnAskdQurfEx$ZhV}SQ?vjFa}osAc!BEYICME1eoIRT 
zW~KA#SqWg3z{_?kwl`L{dC@ZobB1Nj22qJ$VOsKXPa9PmZy;%dS%xlDThX7MZwBxF zxz0Y5PoFvh&=*t@r+UsfOU2yYE4^V{06kVWDP3=3vGDaA%QPD2)7WDI_=qe8f6h2w zrE73tcsbZ)csL4ZN0sRqm;T!H8ZR%4dZT%Rb9Sqz*ih4QQ#CKk0^ney!*yrRIQAVzVWVkxAG5Vhe)TZh>V3R-)8VERHl9LH+ z>5^4BG5(a&7WaeRX>J;-6Y|{e5A_40-I0Y;f#4P9*6nVf7}o-88Ik2vmG!8~yHMg| z2m7HVI;)zSrkn%h@PLXYqLE<~^I`&q1fUQy%2QnyFP<*dgu|+z-rzulmdx z)0@yi%$GQ-F+GOM5FtjzqCvlCpQ-{YS@Uh|&|@=VI>dEy&bA@Sb^h~m&)(JLTP|`* zj~NAnmL|_8?h=Mg-CQ)AtQGx`ecfHFHvHWwZ_F>XaYO5Nfj3v<7T$H=xg zwtak!UB)=>BxS^b8M1|tpb!#Uu0cbS^S$KMt)C3G8#H!#Dy5=~uyRSx@B26Eo-OL< z{OafD>XjCSPT#r-q}xcC82;vIY#a_gE*C4kI>ME8JnNsC(qh!#qOcsO4e@fMqmk;v zm<$JmP&%xGdV2Z}qVMnVa5H!mc~bb1%CK)<29GrcuGz~y7tuxs40Ce_p+$RTeGLuk z%4t`YlrK&(d*3icUrJsrN4J|pAf{5(F8JYGyRbi({UGstm%HX|yr2kyPLoHyFtgpl zt~BnO(}0SO@(<5)-T9ct#CW>L?hUQ2tD%AuwN5*dIV1>9TZW&k3$t7lYng4>EF*5V z`v>}dGnLTia!tX2eZzuVYkD`h($sIzoNgI`B=wr8L{FyUb=-TdAQmDH1UxtTMXvKx7sY`LHOpD6Q_XafRf*2KX)tzx9=GVkX4N0ofpyc?yuLlT^GJA zKBc)&2&(lKD}8NY!Lq=(O(CoI@RfNrvtT|DtColH;gA4V$FEp4gJs2wI-Khx7x7Td zyFfC}Zc;-LHnr)gkU&xGha;>Daf1g!$78$Vbdb5!cri-`4UW^IYl$Rs1dhE)x?AF0 z8mgg2sQ;zNCLtLcm-Lcu* zJgsh*LIz5*mA9%-iYt9uavfW~Oh%VGj}iNY`uY{160tF)!q%hF3C2;d3~`YGHph;7 zVjI~;>%QmX;ZOkX6MwJ8`Y)95(HoSKL${4XH|FEzoxQfhrZ@oo5mc$*Sd^m)ri&WBR8pYwG+{4ehn-lA%F3gOL zhq)v))B<5VLbYg$SAPdEOLzXHQ)4LaT~pF9jXEeBl|ivUviKG2GcV3>kw=b+7g{~o zG+H#ns9b~F2U%whPuGo2qh%jBYPMlLi;9ht9~--ldis9ly7H-h;N&Y59UlvqBud3w z;!9dil|<$ESJucvPN@Frk%v&krSR_q5lyY+4sX{NJk<}i3TTe5ofXeI$CUHf({e-9 zu?WWJh$Mtqs^^Hj)UGg*NZ48NHrucVnrAlgv^Fyw+rsfvX2>=o3I3l=QfbC=8pBzn zn#Qcy8)WyF4RNQJls!`mHN)!~Ysh7y+)_1F5sIxd&kK+!4 z^bqOJ-~%TupPwr*F}LL2g$;LtHahdG?ISjr+`No-AG3fscVgUJIX+LH%5aaUvcoRI z07kXk6_gRR#IzSB-o38o9w!o zYgnOXq*Lw|qnXX!=VQVjv61dlLNl7)`J&xZC*FhQaoDIl#(GQt5*{|hs%|65yZOn4 zr@tVAb@X<~Mlkj%;Ah~EFCPn%D23SOJ0zyidF@!&^EzEBd)4OFHN=O+l%M*c`6h z;`WPCp(?sxsAl`dYyfLWm^mqOX{%+i?o)%F}i6I|J4sw>*bdb$vl$ zgNvJy(rC2msF|sVcYNc_5v!&HG{EA83Xn?8<~AxtYUuMO9^soCdX>-e>d8g2Dkw=n zDy1*sqgBe7vLh-ZsM~Wpk1&gTd9?(H*i7Xt0C5$C6gd@%hYxDQ7%L6$C;He(nqzxDU=Ds1s+?ah7Zl z4nH&Ew&D4C^{dY_P15A_UGZQj+Z=+LlBp!dJ&U0;E!z#5>LSs#icpJb&h|07#0kd`=?6l?4on~d>qO7mYv#8sK z@-aa>uk(-fM5IG8MQ99m$4fUfYOJU6jiQ2-py?`yn+K?HK~QVH$dX$-hx<8axQF!g z&Z{{d)D7VC;)b-g*R4WB2%WabJGkYc+viG&!?6)rL5I(B`I$V;JF`85E_rwnTu1zc zva^QhmJE|o!@i;J9wyGWDNPi8-P-MhhR%X>!ItRG%UU)Xp(p#$igGi07Ct}D@o^>B z7^%c)dJYh)AH^w7?PlAK-5}wK_q+FVWtyhr)J{qFAzeaWudFP+Y!r|^1&@;w zB}s`eE!0nsNp-}w9EXF6Wsk;m?dP**M`_I+?NWr&U5P`_g#!evam|T&^(m(o_H$9! z-7@66B`PQ?C>G{b)N}&&RfI<`M&BNsTFVs4Vrj&HY^`Ql+ObImI^i%8aec3s02`Q* zT9ONiq1(TuAn-;0VW&{HA8D-cwQcZ|?qpk2mHZBI|MdSyrKx5^UDT zjpvffkgw5540(JhGW^6`)HcoBsYbtLMWm#A5bUDcw*)PQW8lzadJLvlZZ(v6)THWz z24kgJflM;6uHdI4D(iFM?(W}Etg1fR3TaeiY< zLt}CtjocswALKgQ&wE5GU1Ge)5IJ#FcuW zNY(z9#VcBXFQfBz5Nl^m8%bl^(sZ2K&-aMNnY8g;FiRhS07?3I_;fSjT`p2DYnj~= z+1-6$Z0t!X`SWwM?XrVcZwNB7wbI?4lC`yxv$K+QqPA~))#YOA?m^#_=fJ=^GYS7P zJHw?W-WFa?H?E*E=^WYKm+ZR`Jq8~)nKNSAuY?@!Z{X;K3Dc8-qTIPtIyjBTJF3n- zHX8R1rxEDE6y=SY@^tqaP3au$KnITFavHk7ka3v`(W>~bSAg>- zj!e6ldM?c?P}aAyJ+QFXHM(ijYGM}zcI&5(G}PxhHUxJ{4faCHVj)g1eKk}_=c<@J z>Q-*2ym^%eA@4bZRczu-HHy!YB5J2{jt+4Z&Q&VOT*E7v%oE&ce2UmOaXdk>0aNC&H^x5R;X8{~emDll4ecmKvs3OJ zTf9DE*X=~RmhLfK0UPaddUV=SHmS(Ftg_Bpm+$J`r?Q`29da{sz@=}-6cf~b|> zDGDo2`l6GQ#biSjl`0J>KNFBPhL&{4lAxOq<#uLSILdWwaSpJHA%ILp_vlG-R`I6Erfd{YbFzUN&_hwedM?>H{uBP%%OtL*um>DXQ$a!0eINXo)F;wmFq z#wzKln3;W`_<;Tio2)?xNWB`rT*h7<$fPI}kVky_s?9pDtmNwqOwO#hnPq>y)!<({ zF7)Yaroo^el}O%ts!fPoL#tWW3JxgkL?5P5&FJyHUtw+q8nxAu%cH`aAD=Fves%-r z)xNT`6Y_Pe>f~QyP*}R6vnsMCrm^f&|waz2b>0yhEKTiQ+OwJ=p9p;T>X! 
zw(V&~g%ahD*_|}d!f5acYO`-4uR)s8s^ho#>KS9ai-#)FM*XM1Gy-g(%~tQ7Z63I| z;nF1tys_P3YRCHAGS#JS#1TC46;Q6rq~|QaY2A&jDwbEdIVrg!mYhm+^+0(xzeA!{ zp~bG&F68j6I@5j+BGkP*-bV^D@>@@)ER3d`O_YsqxP{zeF)Kw31S5>YAff*n)M@aO zuiN)G;m-bsC(`VzsHIK-5rsiEy4DX0;|p)6kt&_|Zx3U(KJhJ=W3cEDim4p}`1_rl zMMR0vR3+C)Xx^)Wz-#Ri_38@CGE4|l&EVbEi~*`*?zy>GZW93WdNVQNb@NE0#u^hD z72Dl{8Wa^a*NTPKS@pn?p4pt{x}y4xbOTS@EtUuIJKKtH&k#Sw_I8ovWJIYfX^Bs4|xvuReRJCm2+##-$95 zCihK7Afe}W7`a*ZIJ@BmB2)R+#r)@oud(J;nc89^N6poygO8T=^M+<7(jcyV2N}4Y zs+hC!9U;#m$mjVjq|RSBiqgRE!&BR(r`DY}yW}52%$$T=Z>ZVVZby{Q`s9$pGNtDX zY(V!*5}GyKD}c__wHMOIkY4MEs!Q2pDchEMHVV(8EwVIZNKWDo1LF%#>WV((5I|}F zfYWrQR^%l&f`adKCfQThdUY#%CDG9+D?IS1kCvB9urY%nom$%5@|v%ETT23tXGyQ+ zWWOlqF85aUK?`WV=#>rWkzDau^3VrXQxNQI7sS1rkPgiyY}E`mkq+zMu(VK7X+?Fh z#C5NOvA`2%ncp$9s6#MBuCZj|@GV*DIjR6K9Mn@LA(5nV#^ zr_Y-xP14tU3K%_RM6|a&-eJpGRow#fDNP!*hf7^;kr$@2x}WHeo@&8Ouby1*IYuYM z(4V-xggmD|HkkMPqHs+kBJ7w-1+}t*?=t95DNaE*oc6qJE zNsdx}ZVo)pS}8csI-ip}bS^&G@Qgn$sCHaCXeu9gtQ~RthKcV4U|2=#Su!m?@Oo%w zWb=RTAodRN0#nqf_bp}NwWNSC4+al%;osv9F7uPBz^1oW0 zNfiTTbB-vlu!}5~F}Hn_$$h9HH7z_~U*ii3Ro_jb5a+FXpzdGT_Yg{9GfA3#9UgFZWX{*dWRG{Y*1zPyYyt*o{8GpPg!@4!IfO27Ran$DD^&=zzqfdUXJH zMa%>P=xT*kbeC9yS`t1Ee$P$-_vOkO>?(vAvCPK2WhtEXI~az``wfh=N+5j2?&^#~ zpYfGnsRvL2wlyt3AVbpm331Cm~w z&k$OXrVmjc&1YFdVjVFXt>p_G5~Lmx9%2M~OMU?v$Gw07tV7YUs`|;yccLD4#^Jc> zu5p77G}VH>lmZ?7AL~9#%l+nK;AHlrFADPVtP$8hu(=HHPD_Kk+_-=NtA?Y^`|0&- z4yHX%-V4cbB399vj#8yvSh8Q!Nd6W6Iu+#oOB1wV!w>=9H$E_NFdcH<5{s#OTfZ0i zPO0B7_PZNu0~dlm+#nuQJORM?EXVMrL;LM)fV)jDT%V{8xvB^4$V*tL7mUIn?vdId z!$@P7jf(y$UzfsdX4ly!#K1e+9hAY=&HXnzg`6C85Z<4QC2Gvz)oVUD7M zyWje$a)?U>{rM$Kha<*gz?H$%!_TRQ{b7op`qKS8KCRSh{p0ubMnK4yTgs}k+R*M^ zPl?8<{G|+y-7!9a#-rIO$UUzxA}>VlP^=*?QI9?_0seuIYFSv$+4jMy`>%a4Wl_tK z27P?Sd658V8}3f|?KP84nLr0NJ>pm>e&1(UG+5}EnIN3{kv&uMnh$DLA{hW3aDv(<^>j!}pHbcVIip#(6|~2gGo~r}Ibirhtm{ zQA-mbfPBX>gLK=g;gA6Hh24ByCy*IEsRj0$y@0Q9$v1#=B{b|vwqu#vpt)Dhn5vlx zW12Rm0g+4_&=8@dyJZ`QOgH=8t(8W^^h|xiJZK3%C-=D9icARL4USGl5Nv+lWa?tc zk{fQMwB`l{3{zw1B49(dN*0Et17$oE=YB#>!#ClW?sD#0gK+C%Fn|%{`LBQS(1o@c z%Wf6_y9(w0W{gtkqZ2}=U1a}DT?(S~EEkJO0CrC)p;O=h0-8~T8-66QU>$f(~ z>Cx}g-Q)fG$2lWohwNE%)vT&nb8X1DZWcNJpNz3#<^VMjz zRDX8ecO53v6M7pfH(+)d9}F*T*3oyepdzwQzL1i7g55pzfwjUkdGZ;w+6qNHV>ns_ zcl=@aoZ~2knwie4@ObIIi!*~<2F_=5nB&{sQLryd-x7#s_$laGXy-xl&CVBG`!So; zE}OaQ+9U6^YZ9OL9QI&5Nb6y_+Ib>RT%Y!;7W&%3Ua)ulz)MEJOh*ySa(Dy}5GQ=E zAU3^IiAeVKAV=UUxWNpuos4=|Rw5IE^$vTUkOc+EATGZ~oHpHiw(hAZ7Vl`c+C$KoA@r z!UGKMD;*fr2l!pc!;uLebRI(4V`0AAkxX#I)Ug<~3CI+}?DB86o2Bt;273UbzlS#RAWBnX^G%0~Jdj=<-7i9jv}HJB`l>^Xl5@HRe~p43 zINHT85awoxNo4{?_*vS+^AH3&|o7lPe;ng-Vl(&^NAmn4G9$i(hXLxu;fRnaUCJ1QtCj zL%GS3Q%`=mhJ8MMOQDZUo8+5%YHLItNV}zli%~u>ULt+UeYPLB)1V%q)@Z(-P;a!0 z+LHT-2}lO)K1lJjroo>-!tePZ?TfD3cqGH;w8OdRA_ROdv)23D1Vp zHhIU2P0nJ;eTA3P7mxVn=(jPj_Q2)Y9%_r*Td*d)NHs@;sk7-us5H}(*)jMP6O8bU zCzM@+k`EVL)x$~L0gcb0?x4ZmRs#+bd-G0m`Zu}94|ni`{tC#a(XnTn{&^X(P`8D)v!`quqiZBpuv6&7MYn4+Dy{Hn z8q@aPyx=)BT6aYLU9SVqL(YVEmmY1w1N^F9%*e~CTu&HpnvN8CR6*pP+9ZQ)d*5E) z-N%u;myG9Tit5oN%eI+EhztLvt45Bk74!Z|*(VMKbq#*AJMIYq4c+P%69g)A8`XPQ zh1EL#i|{>H_qvwKmXwv~V-0!NRuK=*lfakXlf@1LqasE9dM|pU)^|Q^{j2?jHYBFO zZ(J?dkb@vy>mvS))bQU>6QeMtrk^ozy4*5w);&yyTR)?QC${$^mn!ho&E(-vws}Z& zb8ID^=i$e+0W+rZ@lk6r@iy4LhYz!v_i7Ad;^naMh0Ccbhes7La0V39a|RSmgm>=O z7#dzd_I~9!+P%@WvpNM>a$a+Z^4>j-leRVJ=E?B%ftnm`X`ZHBAqi8lh4DX4Yi6!_ z_||1Y62*?4$PW4VbR($YnTBKIhYf4coD<>s=QTN%v`1!?ay%AZeBt{=1cA-x-*T!3 zbgM+F3h>bnwD8NE%>%1RkL+SI>|!ws@QuaHCCv2~^lfQ0+glr|Pj9cCOH{h0>!R&^ z8W>yYR+BI*x#^->63OB>SjDE~RlCuYe>eGmuS8&)Wzsz7AchZI=&{+J-W(Cjt%*F zp6C{5W{>h9#yXBVQfM#Kak4l!nb6z0QO}@gPid};TurI6!O^YkPm0e}qGhE;M(Xo= 
zXD{Ha?KMCD6J!1VtF94|5(NpcKsp!Z|5DHeT%zpm=wxgyWn*Idsu2UUUgiLr?G8&R z1W4$NsbCot9E^+|K%y^7RS{+2y|S~Not3dQp#K8Y7$7YhXi<;`%)#8w$<_hTdHtoC zU;vz}*h)x=%IVwvQ!oa|#nfThIhbG>)B&LwBk*G51To=^z;(=kff3O3u)#8eR1z#8 zD?1}QEC(Qr(t!ocud)v{;)G=dT?*I%KWv~`fL!WT&cen88e(CB7s$*B zNIy6jIe}p=W>_xJBrZ@ioXo7CAx=QD2U4;y0YX2(6$i);GO=;817;?`DHAhjWhNj! zzzzspfKgyUrdMSg6Hqr$03h!m0S*%jXjvv8K_Fs4QUr_wc{06b$Ml*WJCGh1D`;sZ zAU~$p1eyNI5U{XufwE&_2Q-YY!a`<1PYCi4suD9$6z11zGQXCS9Z)K=v4Wyx23BQ$ zRdO){o0t#0;n?L9v74VFvV7%&%e?W?&UJD>)Dl>>;0^VJ(C$Qx)BWMz9*L;U9j z$Ez1$ExlW%5_`*nn%C(c*(87I)eoItcJfW(rM z9qRPC!1ujm=HX zoPgZ~tjfS0T@7?YdkqK>^*i0GARF3=IbIVEU_8}8V*s=iG*Pin0L;V#R%87bm_cW~WRcVZIn zllHD2j>WQhMd1%;DmkVG>a|V=Ouy&Ow%_hD{yx*7;pwqwYM1_Ey-3}B#iKSa@5p2l zU+aws>WcAQLVsZ4!Oh`b?*aY#vN@iQzsRm>Pj!(2e+|=kfyd(~LhwPVL-&IXcaDq< z!y(*_2X}Z`6b#B91b)2U1^>e18=}S2TNSGdCVo{6j~BZU3$ve2N6< z7k2dSVn08q1i=b*quk7|H$sb|(#=&h#Z^S0D1BFp^z(c(;TH>0<(EyLg+hsbAuQ>b zi9?yzcqQyC$GZIWs92RjHriI|SH4CYz5clG)#nf%mjup$KPw5z&E;dTZV$ zyiT0_t`DqqR9&)fkn}`-7*qP2_ysy7z5kXVW52X8j%@K)J7eoNv5BrDeFDW@vJtpH!QYz{Jo86|{HB+i`&?!z8w20o$o}#Qb|tu-p($_w;Xy+5%jFLq6NJkjA%g;Y0}9eJ zag^ZbZrl=`;+R$=!X&X|l0`r%hRG#(9Vx^tW+I981hJbUo=l|O2R(okU229)q#mOd ziY0u&&m64Fe*S?glmi!^ba9NwSJNdsFs>^z`1Eeh!qBrvbCajvW-Q`!Iurk>O`ABO z9*Jm=JFd3n*K&05X~ZFJOpTi)9{#7ogYxF<v0cVPSJa8iaT%a9c7;5w_WWTopMWO^l}=^=W# z2!#YQV0fUkOi^m35pMr>ueNTDW!j8fWbEtcpAZb^yKb}@G4~m7_^vDP$ zaUnZB&%@l3F<*Ab&qGM=twN|km#I{mhYF2CcPy1ya1!tSrXb^f_&CgT-fZBvNGycr zY@6v)KVef;g-w9n4QqTm(Uv)Z`wkQQGT={Xr$koj00K!T3phGA`GN?X*e)D@(R-A& zQgBP-u#)Tw*u4VIB$jVUlg$JO?;U>J;J87QE9hgk45wP<810v2*3Q@Zw|HiOA&(i3 zd4NksD=vvoXV)0<0fNMVKAzLD$v}nkx9pjj=khggp zE?*ck*4QCvmtaYb*0WUk6(<~f%4PPOZ%vo50~Pyttswh-MT)oCGWlu;dm`Cl8%mf9 zHQbx;9K7fzSMnURd_}H>X=z!;Bu%kqwC~fC!Nk;qP`j?j=JEpK%=^$6{YwH=(duoC z4aMu;<iAWrW(uK{#y+ zCy`HgsqSDM`0e#{w#u`WTW<311UdySa^JrZ@#3Rq8QK_g=TgC7ucfbh1@AHwLz)gy zL@cDluocaI>_iPaYO3IF);Kar{5qv(-h{&9mQ)70Tp2{ej(JWO?Ppj(9{1BgJw@eu z{reXzaszQRc{%?i9%iOxkuGjl%1$XW2J*x7AaXXuF8Gz-aRv{bn6fYw(h?OlYx;}9 zX*eIrg$KIdYUwlBB%rVmtDF1+RAV{#W7jNTwh^>=_&plGWP8wK)-Ftc@u+e!yjhC+^EtF3TagVX=kxd^f`RNwSW=4@ZI@$5L4fP}`*{y(HwwZ<1eZpqIjb`I zX?*bDwLevUYioq$^On@G)}dit3!wxOtRzJx;lC8Zx_;xdD5p@ohem6jC_T+V6p2ZC zf3HF37tHz#4T+@N?b*%nix=ERv}vKgz=Ib~Wzs0@NWiyBU((=iLNvsC17Ud*kYeop zday4c8fyCZB-GivOjUYztzZ}up(nT7TYmPbI9^6;EUktdW8bl~&Do-}VmgcUh%f=F z^u+n^-@~yxzR%EA9???08ThP)OXA>Ci;x+bht?WsYb0Eb#mjS-UIX@%F)+ATNW<7J zP0rI14B1WC7}|gC+@$S2Zf8Nb1uBbulEzM+XjXYHpW5~186B-{-KoSnZ#BQQOb;An zxy;FUH%BlzSN(IZQ^3)=$1=5Z^Wxr*;Q-Ic1 zij@qy`27&t>JQio%O+a@!Pr7+w9Rg$?uU6&@hRN_-M9I9*FA4lFlF|!eLq$&^i`qN z+6!f(wD38KU#sBac{y1aXu%CfGMhWqRt!c46FR{PEOrgCg}n`<7YQm3<#6dPu&li1 zKh!=H_+_OeEj=-GpB<7W&cBQ-$I){CP%-M+uc&wts6AAYVM6PF!(_Y3=5wl)?IL(k zv!O*{#!^1{_*Sb`d7RSG8g9dUNZ48zYJsI=!Gf_-YX_yO5xfMxB5%0GUtQk@#ZZEW zJpvuyWsEuz0mki6Ufyr z6%TMBa#XT!)c$xX@aH|?d8cWRW36<{+PhoQ$F#;e!8~~NS4uSw+r`ZE>P~T^@CK`q z7Zzxc1}kogm#1|87VT9xTaQ z=N*zNiie0!VsY@ETHR$^>(dV^D*==?=ld#FP_w@{*|>XJ<$|t}M4%6GhbDHbDo&a< zDW>t;koMK)+}sN15U+lw1QXY3@&ybE2jVElGBt>@UT+3wEBa0T9B4~phpOc`$lkg; zH0iKm;x45j)H9Y*)#g{OKb`LuxjTd~x35L*pWd#-p%IN4{GCtVh%?FDBsa;%oBHJM zk!^8++j5^hUb(C0iQHreX{Dcrq%wIf;`z?b0czu|MJPka8{yr>UtGi}!E=OJN}`Nn zr+<_+_byabo0k#mckOs~7tcQMwUR%#*d0Ie6S+OJzKb&UnYPMzYB}zbMUf&)^K~$^ z*yLfqcgFSV5Ern_p|Eo55r42|?Gi^|x2U6H&xs0(c29@C=-w&r*U;IsUE)uF?Ao7F zm^P@-ome+ejW+9)O7wZITF!F+z@oKi ze@GVNU7_{W=6=(8&-IK>>6!XZmC1=uca0C0i!(E;B&o&oO6v9f=0osxthN`E%~~?j z$r-^u&8028r^KYHQEnn`pKh7#qrM3S3}Vk8W|zgfQ?a2$|0MmE^qrq@qigMIW_o7! 
z!P0@WT%C?JDOGcR{y8}SuR=71ATibR^EF&GF7IOUL%548;SmpiOX2l3-E}{dsVQM< zOG|r8%SLxcM{s!Bfi};)JEOZN%y~tm>^l4$Z4z>H=d}%!7;*BZ zC<*Bf(Qn^R>fYU3(Yzyl7)V!-g^!GP4%j9lL4!sbdaIc;jdxTm=4I}7J0no00oket z58+OPCr3OpjujLV?5$$GvTZ+#4g!v1kPcU#I*gQ$?z6&a*oBJt=F{^2{8}18jjKhF zqDoy;iiB|Kw;-hwCMEm*BTcK(6MwDekLztxtiF(U;E#Ilh6tO5W{$QB1y0PDH*`FD zGAZi{meoSDQ9J`})w!VkFNBMmwJhcTw+KqNe&qiND zaOX@84k`$EX3FtKQ!4A#r%LB%S*vFVa40Q*9$9DEc301gvRM>Q`1E>;uS4fzLAuC_ z%W<)m+0#FmS$p!#v^%QhV8S)}b3tJAZ!>;Vg(Cd8ae9$f<(J0(lUNBc@uk}_-{-kI zwf}M{en&Q+OZHN^Gfcf3u}LZ|}Zc{OdJC zA)IxK0A8Dq%XORGZQJ|`iH?qhM{T{=y{}F)LE_9xwGIWl4exkfYYc{ZSlV1rOau5H zhH1A!WfU#BdJ4I^oJ7KWnuyLhqb=lhP3;lPQpHIG0zN(h2h0vm!Qw(b?ne1|gj1>3 z^c788m+37cixiV0#C#I=wtEO3jOI_VRLmbFDUj5;q^^6#B}LbXQ@`Nqs5_}~e&9On zA27^SjMdr?S~-mR<}YM5KZ%HR`LFyAGyQ_;%J0zZW7t*7rOulEu}jXU>IJzh!@V27 zuR`m8ajBxPl>Aa26wtjA|0bT9ZE5b@tc>|0G{Q=k7M%%XapA}HoWO)`b5VhK`X6_F$N-o=Z6+e`;l9@BMe)v_H%~Y}IRfTKW+MhXa zbZ&w%ZEo%soDU7(RbOZ2-#=e>{%AYh*RkwHGLq8-f8HON=_sn)OMjBPoEW2NE?ur`D*0Jg*R7mpt!^UT>{?=Q4eP#n7L+D^AJ*yfBF>YB-6N#h8D%W2nVM@k^3F1BbJ;ywf56 ze1*M{?ZcV=mlHPa-JG-9?~{|?FVEaYw=M7Ih7jjYu!jzfu2YUZ=oMwDa9msw8|@U#SM~FIrEP)`>-Ab z)J&RZlZ7RxnIji;H*}ywoD8K+BGs&H zckoz8XX)tE&Pub(YUOhNVb;R7i^6g8ue$Af*k)Tpx#a+*z-iUxqOwmqn$51v&!XD; zVigU`DJJDqbXxviwSQuH?lj#JSGf72roKrQvD(%QrS!E-7D37q3LF`rd#t-kKO>!L zG-_ALvDnqjYvVK_z6FPBI`r`tpI}zJ`OR zn%FMp4b@I{U-U`nhdSvA&o(x9<-Rs2mC_ycm<)oWUCt)&&2#h^F`fz|a|KORbrZL( zKl(h{N20DiU-~24KAX=kTn#B#TuIZKyx(pv`}|1-^XfyWOVKo@jqZjq8<6O^@OMug&`%#N#|6%e&HG&t#tTW#6sPw_2%Ad2^tBJNpaRr`s z=gkdgQMYs>RG*GC>!bn>BSi@{QM<%#J-Iy2+E^O0g;5406iC;#(jE@Pdf(75cmf(- zPZNcE)sv%mW0hxTg;#Ca@MMEJ-uqJo_jgCe5Wp|GQj$)ZZRO+7N>eY(6c)GnbhQY+ z^_zU3U5p-cFI2KVXo}P%L{|}QdGCUL($mG96t`CcpDOT_HWbD?mhDkZplu21B8T>q zONVnZ!(am?H=pA#F6)!xeQd_9u&MIpW(ZeBV^_Xgy>>{bJ?PlM#{zO34HMmvF0+tqZ* z+vY2b=Jux$;aXw%feSqVFzM-_pj`_nveUNv-`Gw+pWVJ>)(+FPtwi7A${I9OL>2fa z*a^r|i1~ints7>w<+aXBDS0TZ*ATPpusW)h&P6cYH2snOXG@~1L_02XhsU!mj>h-C zkt+-7cNMeB!^XFfR{W&`BH=mOZPxF>XX*K8WxZ;F?Qb4QsxV~aPp#aZn+itKnhYW4 z!XlnJE{z^D4;XP{ymAF04{gWL9{E$jY{MP7sZ_&zI^-f`(%WiSpfJ zSWjmBw-w5crn=*+NqO_FWZV}+sZ4Y8^%=Wr^SBzEN1a4w({-HF99dV6N{g=UEN()7 z6XrPwem^>!jq+be60Oxq>HHp(mNacHJ)7XF>t>R`U4^aeGA6)YS&%ogcsMgN-~1h4 zL&j0l)%?-52~#6%etsF5|Glfk5>6J{Uc#ww!evK0=gPdhXO~Ayg!SoFe8SDWr`Oh& z7yp-0^n+Qw@yRb@0xQbKxR#b2@#eNA3#pXI;GPOgNcrx>`QIdQ;-_5V>o%bd!Ui1059eH|w)iigwNP--sbeDj{-FCFoJK=520CDyt`I zJrKw1X@?%0U61d~79&r~U8(C#!_o9`W@ z4Dj%UT^W~3+lZ-&Z{PP6wV`bDq1sMX*s<6YK)(4RuWKU8tT{=nHt8u%uu*^_KX7v4 zm~Pi;FfuX9OWLr;zhtfiJe_L~&VO4zy@qEh6#e>_bTvzLO~yjqs>i`K#~@x3Rs8av z)$H&XIYgLcQ!+XCFl|F{{_no+wPIS`^;6%E{S!2JvtN0BwRqm_Oq$zPVLmA6XIUdCUP7*&X9pU7wAAmH z8y&EZk{|TG#Pgne(Q4!mDHV@y1Q_v`|CZTikmHrdjA{BhWZ)FO2Ilh_p>p9*i!CUJL#ZHRoFrEfmc#CNF`f;#cYx!PGt#f!~I#X zYcj~D%=b^qXvEirjD{#)2^+D{8va=FarlWI17-Q|u6vtg{q5YA#*HPh1^EazD+cGA zRBaCPmCOfNq?3_ufu0JHnc7^WPx=z_JuTxFy`)yFgCeyRCA_M-Wt99G*}GhP8uWu( z{9^PGtBJHsg$WcZS*yP>E%~X4XduFWMTg!!@?tt0&nXjk%OAbv=&!SHQ!%mjg19-8 z(IdOJ)sQbpZnZkH#hm@=;AJc(vM;J;^wU{FM@}=#(pH{p=IYKUS!-F7s9fpC)AM?C zd}Z>mvor3Gt>o-)vlU*18(~t3>(e7A;PHV$iJhN_9pSU@b95Mv*45ugzeAlJvtUYi z(W1pBKJ4X_da31{92HQW>de3wcxGYZaPl?|8s6`OR(5zBjMW`P#6=wF&e!3%-aYD5 z2HOtZ%F!!cl4at7pZ1P$8?uG^X(7Vj(#-aeHjTg#EjPJM!e>yP;&bK494Z6Wb%(F! zN#6HFoJeg+`%crjCv>57Yj^In>=+^TFR}x>)t5+X{Vg4hzU60A?)#zghAhKBUIJE7 z1l$&3CWCb%5cqm_aRpy$X5^eR;2H&i#Mk@6p+! 
z6yL%=ggyEf6CzeU1~SC8qpTvo~f z2zjkHLH45io7q(AUJox~S5VV4SQ7phFE*Mk8G1suK-P(CZsg7?nTnavbAliYHri9} zx`Cho)VysZZNXj$_7| zMDIF!^^gNnDtd)V*nXCh$!53USsQJU{poF^@4=LOfLcGh5vqC_%yq2v#FawLAPo;8 z^88^0jl4SurH^{13$`1kQGfX9iusAC|AkS5x3`!GNpLBD&;km(2`nF*{6sKZuI9tTG6p zeH;8dmLgx$V_}X+wI9)YcqPum_&@k>cFA+W3kW`=Z))~syd8dnux<^I%`bjcAvM@N zk|`Qs&*>72^;eadm@gs)?I?HHoA1UR!MAL`j=r~_C=N~ta6iti9=o{-`mP&%?!8Wy zLBV`yuHs%X|B za$~)*xtaX2Tm|eyqc)dfc3Hju^0uED(Eid)_7+GpRkIFu{aHzPP#{&SXBQ z{1L1A*4ydWWb367BE`F1J(vC9Vvn#-`%X57`VU3s6LH5ec<3`_WCyQyf2Ka2rheb- z#%wv$_H2Xl*<6}i!kvSn=nMDd32{=ky}9?X0!gl7h43uF>>1<8DT`=h#}3&CS6G6X zEQa9R5nUMUX9!Ia_}#Vwe69C3A?3Txl5`^5GW1wH_al)CCF)^;cb7-x{L`HKBmTf4o(8WG^GZX<-J>Vg31kGwzq?6la2r!OX?R1Hk~!hH z+IzR$LpPxXM0f_1O1pN1yiMpag$MTB`{M3)C%Z@2!4vez@10l4Hght@pWknoeo!VH z_~Oh}207f0ERgUgBv-TK+sT;0U`mIpn-9v6g8JU`yMuE!+wMf|e(JGj1^Ek@-N%B; zUhrLO7%Gfklq_DBZ;d|Js$ob3y)eA_V@sq>6PcQ^b(51WgX2dZ>jgG=GIK}0!J5Z8 z37cy8n^>+`Er9To)JT_!r7yu}s7E8XbG6Utht6TT{O<%D39hs5Dj z6h^xc@3;SqvkR5{_04CjL~wE6VbVAE+(V0SvhOk7IhjbeBdIHqIp5;Ce5(3l)`q$vzUGT$^7WE3V`_2>6QvRnD@GkSf_DAIY3Lo_Qou*X5YT;2R(Mt0N_d_60<& zq`pnR8DnOp*lV;3&BgIfWE^W|6-0u*lgPyVte2^HloGYYs{FEP^kO9dj;jZQyCl-| z-c{|_EV{mx?|Z*d>#uTyxo=(Cg>Oj*3Bk2Uw3{z-V)^{OaPh|x`7718Qj z%!TyQ#}QdAkfM3QgBe%I_J*=IT)Mb~+eECvpSs)E>6yh$IJ=w#$PpU0pt0~072nvdU;Knu07)uH`9LE+h#eh z<8h`N#>4(FQp<;w;f8s~JLummNhk^Db{?7b66_T4=t4El%V&N6ac|`JB@*w`nwQci z8R6@y8nr~t_~X0k$&gGOTtEj$iK<;2J5T4=pg6d1e{ zjRsiLG-Q#^FDIbLf=^=`6UFl^TxdG}R&(?c<7A+Dk9+X+lJ~BRTPW+yyZJs~GHa}&8fr1zrBKUi)*tze5xx0_G@QV6FcuV~=zIz!z(Mh39 zUsRLp=JK;2uXaqNt18AIOI+Cb$Uw}wld*7mk$hi_Dh$`BX&QwR={_E6l=JH!U$)Mw z{CL$|lvcJ=;_L5G9Dec=K@Fb7)yK*YrpTb4o0+HZDmf?-(7Q_zP)qfFC$l~mv=?4c z@hoo{Pl@DkLpk@j(rwg}>pS>?;-EL)>R>w_w4}^Q=Ze`z;)+hbIr8hv7UI+*+~Q`f zfAQv59-?|G822y-C7U>jO$;s~j&j*PIY=0H=T066!_J_k$c)GX%P(6^3(Sp8`?;S~ zHA&QCuin{24SL-Sv{ebu$0S6COqpxaIc$ZYo0RwAI+jz$$C;pQ0$C0$`F)CviM*&r zttuuOZKbOx<7u#lq}i&7X0esK=Fw(}3snp>ARLCC7b*NaF(UOv%S*T3SqM|r<-`Ue*<652Wb57osi$%9D{D>d!%>M&+^vZqy13UUBy9c6~|Ic$U z{(E}*A5Wb4e*;g0pkLsTA^$Zz4Z>gnWEKE0*G_pWn%x8mu3gpX<+~Vg_mZ3eclYf z90PPUVEtEG8sKa}qyIun1KS9U{wpp0*Q)4X9coneSN3~EASKz0R9KaaDab)C3jf?rWiy#1Mh*j0cse?3SgQ6Y8^x~1O4;L z;xhv~&HTErAReC+V3t7^5CzW01e*U!HNP^;oPdDfFP<0}1vq#Roea!F74V-0KGYl(kFM#Jy`whl1Ucdi*mr0 z=_6iGj@$i}gQRD}&;=p)cmuR%O)2zLfwxnFL?#+F)V_06jdE0S(tkDJR72yMKK#AB@(Jyt0d}tX(lRg zb8?;U)MFSbEUJw*gDN~9KU<*vG+p>=0a5kg5`H%tHZzH>=nD&#yzeCRKF;#hUT9TulCYgz>asYs zZ)?-%2_M;*OW*vFnS+gG(I4BE#j0Vntv?x%`n^(4>(A&h0}$QI8@svN^TY5OY==i-g- ztZBS^wfO1VKnprsl_h>z0AwP@nv(@5$O?qK#zpqKRGcQY8LLkkpuM~rreDlr&6;f zA612uVTBk*{0nyEH4@UrTI1dyvmJ{WfDJ9>bjuoc%St|8y1dEibqx;Xpae#eFVLrF=*~UGeN|Ru6yZ27qEn?a>UoFl1vlJ){XS8f}pPg1V=5P>lc~Ux^b{q~}PHlGI&ON7y-yGq? 
zAfC+gv2u>hHW!=jgddk3rQsimedFRP^rOd((aw$E+nhkDO0@!>DxoEu(-i+x*CzDX zA!J-|wW77zr=fJ(q#nvTnhz()0?!(LLpiEnRN_4?BdlR(_7}nZV_a;EMPfq-3axju z)3ML{as0JtTUze|bB=7LW~O*H@1<)UMQ*M|>q*6>Y)P4GW!<0_AnJ^%F>B~tp_pPa z(bDW>T+H2YY6jmKR3U}eW*SMy{+xRgzHZEt1V(-%nNX;OgB||Ekfl@pBkBlpTsuU9 zRGyPGRfO1YDaFuzya~}0yKaI!!)UqE8>B7+rzMO^85bt37CayJ2t^r}({;&>zEOW@?5xh#*G>)bwzPVZzYqHF;%hFP zqP(%#eAmbytl4KBAc{*ClfacXTmtPd@D1EgBtm=-hV&%p~o1`fUakU^<8i|9$p z?b40V(ld1p6Yqwx=W)a%3uHaboZpF%bVZ~<-ww8CnwexwMAsRY_0p9#Ij=IRm0KrW zH}tM8lnn4Jh&DL0GBOUC^;(^}ySZAL!(7nYYb4i6b#4s}^?kyflN{XO-bv)i=EQcS z>;DATyNSO@k$24`?i=>)#D!Jv7^62E)zAh_%<)TZ+Fd7S4!Pza-iby&4wN-%SVimz znp}K-N{)D;dtN~>D-|F2Oin^_4y#P1W6BiyHeRT#H0o)i;K^x;H905Z%J^4Sf@#U& zI%5R$9>o5vMA1d&J$pm;<=ED$JGPM@Totf+%#9S#HccO1 zZL#Q9T4lqijom6Dz-qip=mxgm&Be= zZs03$0Y`>We!cpMJ&nXv9@Ss)W!@@sLyxC^m!oK!qX~!nUCC5q&^~8vpc3*a729;= zH@I~{DCpq3y zY&EPEdnRzE>3)|)!+Jm%eoZcUd5-CCcubj~6DH5@z7TOCO2UA~l0H+3zm3pQY-Lr0 zwe>D=dMm2e)>Ong7*aVPOAN8*@Iwv7B^&iUwXpN~EU8Znp5k{1IQ(frs66e_{ObL; zlp^NF@C?PpvS7wRDANS)#D*4P8;S8dxp-fR;Sw8gLS^9V2G2}A?7<_X;5$E{+D1ZF zA~PQZ{Yp%d>E*UHf>n(^XRA7Kafh``iPd4Mtvr&CN+MfA9z|IWJ-vstL>0J)dZ>yVm>d#$IEX!Pit!8AP? zH)DD35sG;rQXyotShnbX?qT3(OQ|$P#Ob)`ODu_>LPaK;(JHM&ihS98aN^r&sNh~k z4p!1!$@<1h*2Sg?L}@K!Qs)clMp@=|8kUo9jT7v}cb1DVY1%24X`U{u6!uZ$Su5K5 zqzlwcILdDWl%M$a*q7{7x=&a9u&1U&$@{%%akvmp=zpkV{FKowqnH1)l-CVY!%abg zjIj_ep*)Ys#zkkQ5NN^nu}XXzrH%_u)g#I5`x3Ew>?+L}C)b|*M;h_3EB%PSN2hI^Yu3{)jz35r!5e-sH~r`ut+6 zaW8M~kIl22haO&1Co9)*WnrZy>GZ=Su~@CH0JSKapin2@qd+>-_-$$qI|>u|x&YjZ zy4@D(xpVYe5?tKuoeBbtgIL@fEcUwmzTti9f|AV=#bv~PIz>ac=Jtwes$i}TwUU!Y z+ae~3Hu(~!(53FW^@SNclP_Mxq-Y%B(~(4@a8C&ORJ#(xKJU;l)DDE0Qr=d{;(vK>p;07#=ZbFX`Q| zxkf)ddfe4;A!8CX6XUUUM^eZDm-+^$*RHjcJSd@i>(*x!jymf{7ZDP?xahc*kh-XB z?a0sSXFM!7iLnzX zQa9;w8=K|_9mmUWRE-gfiP;kh6XWole~W?+pA70=k#7!+)0+{KIiDN;x@@4LrA?q~ zT&${0;8=e)c-DI{2VpL@AS|qfHK9Qi{T?%|C5v_2*r|No@^)aK;UPG;Z=~m8e)2?C3EOa@=`CWFYbo%xB|3S$d{;7QH3D|Wv`$~|Ihz@?DcorcbX2!r>U7lofwCr&RA>?UH+4B&FlJ*=fAi>FcPOE~<3 z7p)7a9JO1v(5Ga`5_A|86J#b13XjUo2YL?-+sj@!y_(Ep^p(MwJtKI9brb+>^sTS% zSmYv78%9;Iq$sQg9J~KD#@IJ-gD7E)iV*~%cW)b(_t=bcKJs)O_YYK=>|d&JO-i}! 
zpQ+!cMjH)FqTQRzKRH?FH8(Zxjn&PKnpsHY)!!Z~sHmtu9B=7f9$w$OY9b_k!P&o7 z8w;et^GA>9ZaZ17`t2%KiGA~yRhk^T8;6n1)K8Lol4w5j=Np6;0@WIoJn5Rt7nkF# zsHnhg?}uVk(~;D~ZCyO~4`?2rjqLx7W%(yYXvUrmztJPHhjh419xJ%4(NMVZFktly zT^lY>`N^z&AiZBO+MAvye7I9Xr8d~waQi-bAnq}z$b+iz3uI75;-}E_(n_b#!xh7mXs%!KNVS}~V!gabB2>tW(z^{v5iSg0aDrjdN8&jTO zSH!iz&Zr*2EnXsHd+wPS?;kO|+8%2t$gi#W*_UEFl5|+MqTX2FX~?i_fGJaIzTTO2 zedb_h)V1KTxg3QoIpe_EZLgoU&Oo4{UR0w^7IGPa*>H$SZ^wgySuW6d9gF6Z*4CdF z?YT}o={vIHd})sFB`IO*tWsb(YY*8pW(cXrutq?g*$WpB+b{!OWT;J&(3bMRxc(G zdIchNr{7q+j98WT%@Z{+mV?a{5YAMfK=mW#YU5|cK7RbcUm}NiAUS8DG9z_{l<;=Y zt<>YB*-c7OFJE83|8vFl{X#e-c<+J|guH2oVxKqMFxmCH7| zV{5qa4rlvY@;+Y2t3(gBka3XYD!}Yr{rR*}R9|BM{_G-XoTq?mBV0 z)7QA$k6ng~b@-MSS6T=G7pCw@UTsaa$~M|_(~fneQ!95%&U)Wljhp zG9Qe6M_@0%ol5CjF~FWzlN1vc{%LE64nQ zw7q3iT+xDViv$P`!94_mOK`V9aF-C=f;$A4!rg;g;qLD4?oznByT3|Kci($XpV9A* zCu8u34Qy(yy><=u$6WKvyehD8L0y9CPwX=%^RsyNVUzN~(&JA&Rjqq}6>vspM4y-v zN>EQ==gJ!-8>En)PS{=Q{kz%B)$zOP-L-OiQ-0?b-QuLSvV4Q3=X}|Nr9aGTV)ok% zdlSG|O5-kKZjNXK$F_N!eONn&K%7nO#%QtNJo0C+0G%D3>@n=IRA`ppH-wJ2YFJuP z$^;rIcY3MiP&rscsR-GgHX$k6XJ6#Xql)jJ$9#acYLA!3YW_LoL#-;eH_K^_7V!Z zN@ZNJF)7%69c&{v#Up&;M-TR8l8X!2MoU-{I9Ke8b|q3|M8%h@t||Zxvsa_C>qjZF zW-Pa8>wh=Da;3Q$eJo-N#Zew)mvyO(^QzO zZkME6%#L7$Wr6)`=7TFr^~PrpQ=hE66_c7Wt$1QE6>4sHEN~6V1X=MrQ>uIHu1!J;XlO27nH` zO*w*oa)f2S+0~Vv&><4O*OkQQZs&FtjHP_QKH#OBD z&6(>DZ6Fp`*-|ee%Na(^{hGteXnts>sgacMsRbgFZ;1xI9gz0n2C-OgIk3U%M^>-- zSAuXh3MiN0)+>Q&OM+}JI%$E<+;s|QM1r8P9Q6e|XhdU;IOw&Zk(5adhMgtCInejS z`r3+-hS*eA+#sfO_S9~HKEkw-GTCKHJK?0qis}2)yWk$vO1q@{=Zj;F5r?v_%b1wQ zzSp5v6<}%V(z2%f45-z>c=Tthi-ppk}D0^5F1dLM5PgpjDxjwofyP^T-C?VdTq#>in7r8xsou@NP!GRSEPNtK&Rt zp6TxjLDS1aXWsa@W(uhm_r)22#Sa1d`7qD>Y*f#EFV9b>;7&N~TRJiu~zVUd&VG_A@&&TqfW*?}p;u~96uaogx!gz^qS9G`P7aIi+JeqfahIJfE z(@7_3GILeN^o9EB0llZ%YVYeX$9}m`SFsE~Y8xD7$u+UTG>`f^ByF|9F6|7v@e1UO zBOKIj)WkZiYz=Y6?YwL~L_}OQI}Nw*5Jhy3MF@$BCAnO8CMCg%#jW`V2-CyGpC(#z z%|JX^_}hv#hy|lu#LfHFQn0lhYN`CVAQ@!`xY-3~uS}D)@)s!T_&T~s*S9Q3Eut}- zce~{IS*g6zvHXP(%?rl7rX|gVnGSi6HiNu!rInzEc1DMX zV?`GiJ%973xuMosG2v5}725%WH^+1+7po~u8y!gx?l>t4VV6d5boe+MwI6po*!K?uFppd9Y^WE$JdhF=4O8ETA39VUL~+P zgL_W~ujUNgpr&V6zsecBM5A1_ZG?nK>O?Q`=qPH+E9#dQkT8&(tK8%eFgtO~tZW-w zlldK6R#-4g7@qAYn@t2D1(4%%4aPb%x@~lzxP87+Pqdb>q2%MkeWIX$JD-?{+3?m7 zzHVHIV5x6==p(Q|U9asm^vX_)g<)6SnTlPW_=>PHh7@HrA9zf?VMZ%cA27uh_GYq) zM?LO&a=loxIIIY4J^WTK*MX1eaH5Vebz#Yi;}kqeZu4d6*RBzmb;LSD;OhUOCIh z>f~JTnBV}(-n5`1^==<$ivbCO_eOG1g>R!u`K#OZC_2`BBchy4`u6(o;7#wND40H_ z_)~TSLs};{mJ@kO5l@P&X5CG3EfXUR-&0@F$Q@E}qdk2}~HUJvX6{bpOUSE zspI^0!kWz&i!@rduP-ZWz}*LGq4;OP8MV9X{#rFWyAGe5UELlYcYJ-KC~S4@*J{~@-UUKg+@)17lLXHP7zL?&$lZn2n+;RW`KU_mzbnq@OBc-&% zx%At=ym$d*BTiBu#Yn@pfRQhJqH>c<9#L?c)RIa=gI5`q9=zYZoT^{LsiwlLRH=? 
z`)L`C=Sz9Gb4X1qt>-OD1Nuoec^n|=2RoDp-m=U6aI>CI!yKe%H%A-MHFC_Gc12ah zT{kfV;79H3NkKWxaaa30J`VlZSY1^V8!VAWd+&BRNDeuu^PMgNec}b-p$66HK=0E# zWRypsI59`yrD&KS5It*4bWveUhLP zQ7PT*8}wx zT-_Kmwmjf{o$>?()$KEtMelh-%vjT{3*>{G&=oq&v%vr(ly%1PSEyt+aj&yL&vxM@6^G+;2%+3|l83t}RqqI4yq0}W z{amN5u*A=51J(O_f>KilX#;z~p}dmsE=`^}elZTAezaoqxy9C1$a5Mr2pjO|k>=7D z0{Y6LBPU3+){q>rNz~N9xb4BjhzJPr#tuFcUut;y_H^UP1A?6gstd+_;D+zAT45hh zqw_NX5~F8AOA-S-Fin($*b9ptL}a3_F!$v5j6!49;^+zCYzBLGT0wy0<#=UIaY~(# zAJw~IJMH0|6EIJ2NH_0h5{F(8D+l-1Y0WHNBJ3yzjrn;r#=gRK++~c7d2QvS?g_cU zG$3$`z6r&$bnfvU%p;+tP-LX*yah15i7pQ%c!!3!b2NrML z(31_~+#~OE4k9N!i0LEZd*9nT6SqX=^ZKu-*ety3d!#8liPZgL_bS*r4}3ZPQh~B` zbKi9VEENtCC4)}#r@t)^V=P+;_H-)Ep31w+f_Z{cFiWZd#Ls$Q^2E`Id8ylR)Ey6m z_cq=K+?Cptn|9gWxyWzCwL^C>GBz5Qi;-!;IvLn$jYx#U{n?x-4sT4JMVJkw;CEyg zxV_5I3D?p@f5bbn7DsVHv`{ZiHK-zj6U69NXCinFnGtUP`e31OeCg0A0EwMXFzejC z&71`6N^B=J%<$08b3x;O!`=M>HPcn!J<#lhoq@UkedrnOqLpNwp*sL(feikY4ghO8 zJCTdC!|k;bS?pT*WI}m6PBb7k+vb`fOg+66EIvve$PUr$g?4+vzC9}Vd7>zOjf@CK zXe2U9xPxDpU>&l_JI#z>_=)}U(|C{D!>8kMq7F3|VEok*q`;T=NDvY^gtp%@9-{aV zN5I&yL4!4Y2BE0CfZwG;Y-ILx`NR%eOm{aNtNJ$fwqs28w=yZAm*)Mq${6#u$jg3j z9sO`XU>(PVqX@Nm-R}=gNy&$}2A9J0N_a?`;EIflTFYcfu_J?8H z-ZwpM=tRcQom$(>Ke3%UP_J`!y&^#K3Wu0bUO;`utn_{*zQA*DX!qgAN)#hFfW7o0 z5~Q#&hSXx)C4pczB{`lCtmO*fbigz|?Ym_fAQpbr5yybuw8eP)XgX!_?FAtCn#`{D zO50IPqll?1hXZ*NwDwJq@8#n~G{A>1vHj?K=b_yatm`8Ch*h|=Kowx*B^7GKKk5wX z1j5Z*G?s&GxRNrC(wJcX>%x}Cw#Pz*GnLJgd|RYNpLX${dS(7F>3F!aJ^aPy`@nwKyCVR2Y{s7Oud_?!Wjy?2qQ}5EFj%=iB9g!zqliMNV_R!+5C?+8l zd;BQG=*;hr6+M^2VA-t{=?7`;GEGNymX1C9ZzT-Tkf9*7r_=Isy%~TOYLi!D4^vTRITT@D;6laH~ zr}zCLYFM$o?gLmu)d9cGY9oDer22HJm-2}pj`3MSF}%e@rZshzjWW8vs~)mM1*IL; zn5!FUa=`ih#e>5#$E~oFmzy>ufM@qsB24Y82Z1|1hIy9HE6TOy`3mnnd$*IqEA}b& z0Gs8yWk8E{H_8%35Bh1~`OTVs`03}~Du$a7eoi-Oci28A&vvcRB#S;EfM|hi$>_^Q zw_ndLe6aPmSC{qt4!w1tmjMkitTx0LBoNY)?-ED`RP5)2ZGqxNaF@t^LwccUM73H8 ze)^95Vm?K>Hy+J>T+MwQJ_*b?^E^&tjM8%@IYj+{zLlOU7cDoz_?t_G@eH>01GmS< z33qRPqV)N3jA7{zV#)9Pp)4KJda{7v3=EYtyzxC{cjSB6b%-%&53{#l7#L<7mQf-|H8+sr>oez1izN0J*{_FluNnSNJXEAeil- z`7NKK>{L#6uaR;-M}Q`n#4Yl`KknKso^^^Rr;_d9Ub3mAu_?LcR)C8rcDjG&`Vy4G zQknz0OU#?g>3u_-8U{$bl~cB=olTw1zm>igbzZ$mfF9cN@wzRKruYC7XR4BtZ&H%v zJ8RplcQN_m%!^_*eq=_90R*TUuThSO8}KASfCdN2dwFxsd)*_pjnnuBG}8l2{t9}z z2HYS-oaD_eEjH1Ou}8X%hyYo=!i{8`Hd2P=RcDDG7Ga3>vuPMsrOgzn#i5ACN8JAY zhYdD;0#vEDIv->owz^v0l5MJJ7?u?sBse!h5&LP&;WvgFY)b6p6R$KeB3w7SMzIq7 z$gpfTY#z8aq*qX(j{B>}AF`h-Qzrz` z1I9M9URQF#kOWB+$Mx7&FCPuZF=*b;iizP@uu|9UbHhX<7sD@$Hu9Q197IbO+sxWV zB@@#&3Rdf83RZSjc;uS|zekYySFKX1f>JI@<%eRJ_Pb>H0|HEU?UePx#1BJXQa0(+ zy09bZophqFQrDqU2Mah65dFGFQ|#^}QZD^`mK$`(xhaFXVn5>ARoz1n?mBaS?KLu^`8_IJaLNlDVirZc zy6p25TX3y>qs*yNNHEEbKm>K`>Ff|F`RQbamj2L~jO}SV>J5I673<~SaTtH#rT^qG z{$FB`U#%^E{^b(?zc|^e@%;zkIJ;f8CXTm6G7^0t07YEE0@7 z{RgY~N1yl~F4wt9OozmbIhFt&m*&VMqn z{`H9dgCzWGQvExGfADhuN)i5Xxc=n?|3M#uDYSpGum0mGcm&LUiA!d%Z4^8iU_$c0 zGlg7#NIkG+6g-*#;$UTG`nOEsU(yat@BOue{wq`X$M*_8)4wo-H{xy!Wf*qh>Y7k7Ofv@<#dQ!nZ z#|mCYY+$D<8~B`m)W84m^>TpSvj5~o{cG|4!xR1s69`^G|BVR*+dRSiAoz7LeF1}% z;AQd!yb8dJ=P$GPPyFBCwg2C5(f?Ea?;m&Nzo-AeLj&7F!5-MZO3}Zt>;K2}-+#ed zhE`x(v>AAX{<-J=b2R@m`1NO6{u%tzk{>qf`7fm?<+k6OkskfCkG^!H4X}*a>IF`0;fl{vo_FprDtmG)=F|i= zJB)GGvWm>IX6X2*vXt92Tvhhd<^mVV;P5xjJx|{`NlB7$k{TAoGOgR9Ws9YxANys5 z9xu3V2x~`XJuEtFV00C(yP|Go^>e8OT9}H=Dl5V!!%=5PJqxsfR9-5C)$WOp+QV>X zIrPa->S~ULp0(FX>Jc^Vn*G&2C9uSmXbcHDH~tJURf@ZV_oZ~L_;+v9|K}psKlA&~ zvv#6p`U`VD?zQMprB+@1yA*axN-jYqd~H-z)QC}(0lY6$MulUw_^0=1+{aILbnMVy zKLqvq8fbJG!9!B=E8le*F>!UmcM2QnSNIzE@)5d4{H}!iyq<{1U1^_g#bZ0C+us!8)BGVLl)s0F{3~+qRndjI&>!K+4m|V$i-s3 
zdblW;hqVGT1B3%NPh)5ydV4MN5t8G7t{S&{JV~~*i}|IuHVl}B8C-_s!2%8xNdRQu}$={fxi)~*u6>p@-;EUHr3(EUeBkhirnxYHC$}kOr zF4Isuw@~iwOe-@U*}Qpk)e~>R?T!3vmb*J}%i87n;4O&z9OJvqt=@u7rmRQe?p9WT zL3JoV(`yM1gHNTnmQ^wWt%8jcgAO}0)-hjg4)B7>KW@fRc#ZV zRs6k2@zB((n2c;l+xaQ7YXujtP(}N>A(i~qPC`wKea3koHr+e5C3Ex5THch2-x;Nc z3GKXlqrRSN)5Nq%q1Ieha+&hqWojID1}3SxO!-p3>e5`U*j!}8drL@iN z%TR7cdG%&;s_N@c39^hCZpK(3H#>kVpFg{{bXXVRf%~5fO*+~jt)p03(LT`J8;M7H zkwG!Bzcbjf^56VKcEO%lM0+p3Tsq>ZZJnhmr5r2r?TUsgMD=KeQ-m*p;8p6r=EI$T zk$-mxXrUmb9`zuiZ*L~f>BFo__%}0!T=jQYQl+ry#Xw=+Wn4l9hWN)G(YR^nYy~cs zIV@@AkpJ?@B{~M(vt03hdayq|p;MXc{f&$}ioP4ZyW>Nm^M& zSbEn{zcd>g*P5r4P-#bdI5k|ZWCg286>D?nny9|!dNZhJdseB9YTh08d~QB6XK=sM zJI_8B+km*Ct-`NqS7=ZFK`_afPSY!Pb2N|=VRsgEMuqiNbpKQAL8fXZ9g5+AHdE|p zqZV3|nhBOw6yXLRK8=Ng$#h3nhhIr)43)@^t^LJ|%fYl&(VA{#0og4Ng&Ofx3w8J50@^PswgdCi znog&3^Q-FWD%GZ1xkEEUA_aNmSnm9W>b6}`Z(sXylkja%jr4VbNocmLRvbAz_ea9i z%v9m9q5ircNZ5Yijsa;S>6W6lTcPe!g**sCA@lM1-@?84~6hsn9d_@0qfN z(r`iRETt7 z%*^D+nV_^pvvyTq{TrFAz#pjjiJ_7wEY8Ttrl5N12%L$C{k>G)9Rfl!{vyUcireCE z@RlC9N{ztAPXzEo4#(&^`Up2EhG9L*mZ-NTXS#Jiyd3&i2mwSqGolul;4P4?-X!t? zMFf@Kl{XDfXnAw%x``iu;q}j(z)>4BrM#w^Y-dd12Kmo?UaRbTEFKUO1j(iL>0Qt& z6P)RWeNiBF+%PR;Q<>QS_B2z?>4+Nduxq1B9ZXm5=jCOkXoo|mRu5N_7)H2{$e(Vq zdC*!q4n2M?@1Uzpd0LckbN7{0sg4NkzaTpyj4)fO)Ox~mG}&4BXj#*Eqg4t5 z2(oW16SE?_^(*mmH*uW94RaW8=+XTXsa+@LXS9wx@Nd37Cp(^GvdQn+hskj9Cwp+s z2a<#w6wIRPGh51Av;?u~3w#eH`Joz)?krkAA;#DXO4QoNv41aQKo&3+n70ZGhvZGy zuVgdqy!}I7AOvZ}d-A8&)(tIZUU{c>|M$d*>M!Bs4Pi&`54ch6TKOWhIWwIEy)FcN z9ZTl(rcM(k=g%`rZ8{lmGiAfFTVR2n&^KA>9ucM_+4ZkHZtV`Aytw*kX7HXGfYy_} z13SC|hOCm|r9Fy2gP4z`R>zqbG7>Ie))tM}h8zf)+!>JxddQEI#PA&+P#A+zCsHor z*G+s8MF~=wR{M!x?2jD~T6Ua)mC-(5PcV3CUr37%o_^F|;6H-y!afIfu{CFZYN|{3 z-`O-h1O*vhpAin}KRUDTpxG?RT;R#RbICi(A#O@Bb|cs_r9d&^Y0Q_qmo|PT@>)vt zuDg~KPZdc!?GZ|H3&Y<&O4&yZeb_{}`-LvHAE?FjVsfUjko)FedEqvap9UAgFYa(Z zp#$u5=JsvXXEZP)tt3ndS_(r#!f3B3TQryJ9Cl5tYZh7xEP&qff90XUzYnh*krGy5 z9QWq*VII=7h4RHc^Sx<%mL=XgYG+z>hJ*w)&m{V3@*%(Uh=*aIlsLBdx*Ko>a3#Wp zzbp(k(ux?5OsxrqoOasu^=S+5)W0DBcRRmEBlYA|8F6H~hI|-y=MOhHS{VQNCYTeu z{vzI{`ov6?Sm;9}TD)8xVg67i1?kQmE2yJm?a$P-GQXVhDt*38m;53xsk>DHlw}b- z1{t33lYP-5k#c*3yP9^R3dQq-Zu;6>-ya#Co~Y|C@}$`#ot?Uu5r3lxXX%av!;31` zWBn!U3Xs#MmbWlaO=B+cDIf&N0-H%(3D}!bVu+|5-sle5t`<((w-Y2z(DT#4Od2JR zZD))v_A5`I9cpZ0tVFC(Y-MaTNPC7^=WM!}EQfVd1AC10DV)05= zO9(QE-@-)w44`v_X8+1CqHwBiJt>A2`>jZiNY&pVmFo#>C&EGF)6sygB^$+3ZBp%b zAW9%+WiJ?cTbpq`iZ4mJ!8YwZb*txqvGG}lYBpmt2lS#b9<2cYh>-M2n{YV=)=zR@^UJG_L5ef~Ok z|Mba{{7B%$!RM)f@|BiJ#Nwl;pqrz7Tiq#IWEFqvM45i`Hg(nRqNcyQx4WT$`3mu% zM$qriMH)%-m=~DAwKXL$nQE7{@yO0WWk(e zD+(Nq8?gSww$y}y(V@I2{wUsu@F03^vB>^8GUhgFS@aTpiKKMQWM+KSSb9$QQh<8$ zE_puQ^|*Rm@Ljeo@eaBIIb+Q7H_*x$YH76l&@S(`0{c8^V7oO+<>ceZ+skw3{W_t+ z3Z~qT{IfevknE+|rqHA7)yo+7g=yt9x6|65Lfgwn3y*-5`v>2Ooy9h|JF&~4QJ$1z z#&MspH7etnMy2`EVa0pt={$9(8_2o(EF6O->h$f#3Im<=Sr_q`mTkB8&ra2SB)eyW zE488Qw}qnIA%`a z_F7Y?J_5eVm@b{4cbMKSLW?#ZSu(mcF@(z>n&da%9yq*Azon_9l@#4^jBjO6k?a@Q z?bU);872JWb}?ASnNR8#i8OdPORaGG;qo!xv2M))iyv^#p>hbTSlf{B(;=q zZSDhCTayZ^uP z#y^SK|Cl%aiQ4`vZ)62$G~gT$0lXm^96J8V9si>c=zlAR`CE?k-v|Cb#yI~v)_hC2?r{=4jn4T#>1t&XeftI z=)@a;B);c~HV@$;lm*E-ci)@O;FhLroVyUiM{2xZYsiqA_(Cd}O`2_tr@igd5qivN zvS|s5%z0?OUx5CqYNOFjRc?!{s26J`I>*5Z^+<`|IdK4RS5o<17=^-M?xZ)95Vk$N zN_%;5Iz!Iv+E6+WL_?#xC@^n>TwaGu?Q~G|ZcL%H+dQo^u!QCe4rJ<1_it_jw*T`j zuz!M|e~EKE;GLDf&D|C@-8nizk?O(!^gW#*r_)2OB0DIdgIpxllZSfu6J5wB2nAZ` zC*nH8Cz_>(j|IP=^aaJ_#n3Tr2s>=?dM%el^#~Kt$?x3cR@{WAy!J=j!9^x-Ybz~x z%4yuSz#?vy`sApOS?_5+hwD>}rEM+6UhgKX^9pX`Gr7#ovS>F~+JNyL%d=u8I2@ylnu*euLI>~a_^Iq zF#47Jn(-8F;qUWDa0RTt`=%-6ig4Wewph;{OH?aHpn?$JFtfPF@!Y70#lH~eI`hSC 
z293yhk0*YYVH4Q0$-6}CDXS5x@lQunI)DdBF$O6G)uuU4%ewj}%cC5!0Z;P`x3iem zOEUEC@z>YrH%HTEyxB8}*e6K`BJyBr0`WsRuk6*urD_Do=01dE?Z!$7n^2TuLysPr z(zTW|=2@{umF22s&Cq?ZW;BzYTGTKa_`(>A8z~sJ+0s(lQIgy_@j3ZfjuK zq(=CdpeW8SUTvD@n9DFyJUUG!#Ju9gyw4hh670c@Tqu_%_boNC+>k78Ek)=mGD2^Zx=K1kJ}!D9IGz_Rsire*Q{i5=zg=SS6)5q8r}#q>0T==UF!u~I z+H+SUbVY%VjlLQw+F7U~KKmbGLlvrOQ>o|8s|S4vc{Vhpl_posqaWQq$sETx?4Y0A zJ+i-q6`Rh&R};IE$DQWG9k`(TCqIe=gB^}IB!d=Qva5^&GpBPT(`z=7Vx{F-cs0p& zlc-tV`wRggYeCfdZ9gbZ(e8qUHytjLMq}h?^LUiU&59@pESDn>joz%kXFx0NHLCo^ zOX2t_VZJwT)BYm-UN$w_uUv{^=;tD#PJ1f7b(GUE+%~$qP68l7z)YrQS3P;|+fgZ& z8H+-ZA?W0uKi4CJa$oEh#!0GG2@^Z6Mwn$EVjs1t*4-%aWy|gK*?1k-lQxzLeYv`l zibC!5vzPv4AiN0-`GbQ zlA+feXYA7JJkDTdu|aZ`-P@|rkr>~LR!P~I`N5@~G`oCW203%9(g^I*UNg&@Hc%Xh z+%m#3yG|6*tlShZ%RkdL@7Xm!L*`Jvswp=+eckrR`Qqx)`dau}$Ta6Xd*6d~ki4OO zSoG_qdu4W|pRb2+#v}KM{f^;IK@Pp<>YHmwdLBNB?N^PQ9W%Mf(yD4=M%3Z%i9K88 z66BZ(<;HJx3GEm>hKL5~#RZMd@2KPhS1RLMMO#jRhy zJxT(w9eBcG_ZkmFU@mit8C2fK@I~!Osf+Q+xW%o`t>RdDV6G-F@+p&%!RF!AG8FX({|@JVS8%(Bxv zN`!OT)!D-O5+f%(c4vQ;x=_}3yH|{)Mq<`aQ2rvYM3*Y^l&UVk6CHEY&xods#C0*^ zQKMLk>E1~Oy@bxmqT61>w8?c%_d5p+U2~Xn8YUKx7)+dgPt-fhhG+HS7o~3(MR}?5 zm}xvjP_x=Kt(ITAGc9;1IKYZ#?QIEwovyyFSjxCKWxckQZ}EP7ji(sY+a#r zwlwjFP(N}V5uk-yBoq4y%lxRh+N?TGywdE@SYq#*A3+4FH&2<%b1054q^5r!{q3=8 zB=h?N#^zQsYlP)Yh0sglzN>=6k%*}CGK27ZPDWkhYs)8{FSS$!`HxRG=KY>;&R_cG zbunQ)ba!Us;U7a{6U*ynC4 zK7sQr*$78{cQQl-<7CJD>E3QDZ~NXSYaoGOB!wrJuAT~RFm670_gC_VOp7>~i8!KP#TM>LjqEj3s;iDVi$z1Y$?~z^>lbu*JuQUf33LWr zX5@t1xH>5~JP4h}LVE~);?Nc2B^Si)^vT>*)|Xf$F*ewm2#rqs^u)&ggxnKH?efjg zWxy2+p9#=_+Bo>xA4aJ1o!Vmqg(hvW=kMpn5$fb!MA7@|XQiP$tW5I{Lq4p>ecK^^ zejQd=uo4vAvk|MtZTe(k#8lGq6^uviG^*@SMp2a%qD;`?Xn@#|W&YfcIXRHm5< zxuqLpIh!v7Rq9LDKRi4Mllrs;^T+0NL~=Xq_2--_vsd!;oKs^4VbO_3N+XN@#EI*@ z{>k5uicihtsk7+x4*48#a|cSjVDFFna79%4`zA+y7VWgb*VMF=-SWq`<;8ua*viOD zC)0UHtK91+H652vCSUboV7POzU=7g& zkL|zei`j`6Vki=|4xq}cN?4Tj!SWT(zGWHUbBv@-u~6Z^NuQI`@&XzJ0Q-qhI#z1Y zD|HWKW)CIjr?Yx>v5y@1N^Eu-`?PV?YBHcl8aX46|b5N*TTeF&P?0*LG!?eGhoC zgbL3`hZ$@;{utgfAl=z6X|LlXs;B?Oi(=Raav#`R)mr-w!_tYAza;Ghw5yjzOFMk^ zbU73<)Tf%9i}Bos3s|&z@Nv(y_-+-Q%^A0w9v9r%qT+8-Or1pu>z`9%*ZzaQhWBGU zDv44B53HO24jBSXPSoNzHUAxYwC@!J8c|-GlDo{m`S@?;+XO$;=7>>!l_{R&k|UoX!Qljy(>RQ^RA9Sz zU8cn#B@;JoM4YCfQhRhzRMaSDto^X`S>l{TYzXcV4uvaz`xS z$@u=zj^QLKcIVoBJXeJWrdHGbVr#zD>FPDecCxkoirjUErtKj+-ZHU`J7>OZdTCsn zNI1R8jFMUz32S^M{}5p)B6E64ZVgkIz5NkGWxquKIEU#1X* z9%g*9`YHE-xmc*jEmndmjg9elE8VuF0w0`7_L0M&K3J@N)aCt>`VzG=Y8%~0Co4Aj z0e|aaO4=aS1^c0t>2~_3F5hy>O{7U^G2E!v8)?r{F@}QyUXR)qNYcC$0I`A5se!5v z@kY_m|4CbHiD*MR+=Gs49jcveO3ie>KKGW_VODAcRX*Ws4j06a5Z(ZQ}(ao|i@BO^w>o_N%z+Hp+!@otA`=rMb`t8&8 z(U<(WN@Pm8`@-CkHB_mi*jFhA(|l6!0|(}Lr;6B3-V7OxL{n)#(Xb5NoZcvB=$%2Yil8+s+-$R|5(Np7FatNp`RA>NMCl31yP1x4kNPwU`i zUQ=$a&@7}fA0Xk;h-8NJwVrTj3&n8vqCzwxXD8z*qd!#5xX~SD$5Bs9$jt&!Q!i%OEkblYHMWdJdBpUHR2gqqzhmXb$1?mkmHy`07NrM_Wm;}uaiF*0lQ zs%U-&s(MIUPJ_q1us5H3jN9y29^S3{ZB(*#TW0etH@9RQV^?h&Hoo!1St7KG3kS#m z?IhSvZa#Ek_s9h65WSqf+xZ;{?yzOnqYqWYJd8F*t=q&{3wCK#P&&ytEN`@oky?u+xY3aW%U;KpuYq^@F8zpl2qV;G6W3vi#8;Z`+gGb> zGhYb4qHCWm*+2>ZIvY81j$gM312(QH%Bc`+6-!&Bv)DBJ7U*+&R){$VA-Q@(>Gv`0 zxMZfReG?Q;)RuPPJ&B>Y^x|83Jp_Y>LM`uVwbz8(JMYya8cn%&2^Aj&`gpS~g*|rl zwBc$LYbM!4SR^L{rd3w#f0;*U^=*kQNfwrnmAQREt>&{ujXN-nJxZ1>?$kggsK>h0 zf?uV_Z6{)KRQ=dCpI#5|QVs#!GOD2OW`~Nzaq_}%l8hq~EJCu#{|Gp-2jw7V>`ZJ) zkMo%Xets9KUNO0VwEt7+0cYWdq0ogs9ZgIv?jzBap|LA8*9q8(e?lq!qID9^7VCyR6OL$n>}nw_=^qe!RDrd*b1*&+JC zZfNeiFIw5V?CUs+DAb9z}uAcG(^0S1hM26X<3_2;$Z8oZ}a0`hfEMOz}DD)wc|--i1nF zh6Gfea5+rGT1G+0S1=dIQ-~@&?!}9Cct@DQk2Ej!-tEA>{dNEImjIB-QG@hEXZ+ri zmT%!_;W|Mz3{I3jLirN`A1)|eH7tRUM`q_^=55*Id9TBBhS=eP$ug}*ty+ig4buX$ 
z?S8kYLpT$tN`MLF3F3@yAvn&O-9%)f`b=J9hQ=uAfrWkNO4c_WsJ<~FU4iqI^_xH!|NZOw4oBM8q|y%7hBruEK7lLMi9MnDudj$b z%`n$6w9m(!_j^#|U2mVC=OJKh)==;fLD(kmSp}cjkm|(7mI7br5B%>)+GD$nl7j49 zA4U*=dF~T$;wa?{FbWCZ;mf}1gv@cWywCj3qdR_0dfcWv+B!adVoMnET2ch>k>i5R zZm3n4y%=x!q_!BzuJXiT*#AiA>9~GvkO4Yb2tAed=jB^f?6p>2%s86M4jd9PnXE>GxG~;)AiELqnfdXan0rEI;(uX;-gk{n0_E=q-*P*Ax!ZCVr)j+qP#m*K)@i07*IQ#$q1aL*{8!-9i-haM{MA z1+ruxe+)q~|2pEE73T*`@++K~#a}cwJHqgpx33|HTIJrLKbf}XCtmU*&OYU_fuL(} zTi}-Kr=ZJNZ#)fis>?2uoxQ6C_=lwn_@`@*2+c4GF2i%Z-HDZ0bSTp)QOyrH4he_2 z`&;}hec9TOsOnZ#6W4U^Tc+Jd`R`Ko%P;JU=btC+A1hgvHM~54_7qR=_idXbbr2RT z3DvPF8Eb|YLl?hBSVG;IILsNho^60_&TOmQJB5y~nm4L|Qx-A=1J2~9wdGQcv%RWo zm#75Ha^Sm?YOx#7i17gL$xHfy@_yV_Yy%kXg0viv!EZ+!}2~t4yLxXHZ)=Dy(1AOw-*car5gH_vT8k z(IFm58qi;{i%!{9EU^SzOpcElBHY>M)8Fe6mnL|KnhiGr^LU*g9@PUc!-!_SrocZL zlI%8-U#~!1o3ql-@#h%1Iwcp(=IaLxu%L!A3h5QHY~!rA!??mdKcC*oG!40w8>Puj z~?yPSr8qB*?`;;Z$j- z83x2hI9GZ@+-dmH^eXRl_Lg+H6A1PEtfLSST5BIF@3ppRJY@Tgs-yi=4Phd%BC^yK z&uzM2=uN-;4(N;W&i_{B1>NeHab&RWjj?XI72>vi*8}2q$S59p#D(I_JHh*vsG|); z=!&8OC-}R>4OfR2jhgP2hncWr%xrb;maEIMQ`!2MO=5{rL|`9$*2NZ^k!`=Om{bVzf>`gqu{7R zPhiu#!zCm=a}~mBgLQ#nif31ON0b8-I$6$J82I@*Qd}U0goQo$zeA`NBClVVJU^c>UxLKjQ80L$51J*b5kNt8t3&fa&H8mztQ<>-Ekue zr8wAMmU%jz<%zBf=%XP_AozcL7puBb6aWb_Y19H`R2aL>>^8qr?8C2kPIyVMYkb=<2l=U<% z!-RM!HHXLTuSW&2{pYwSBI*S_Qu&VAy(i4OeCh7h0{zsp%>h5-@u#7RS}Aa?r4`E9i2v0L!G zW!bC*ID7RIAY1{FVqKzPt~?2JKno)i?{25OJjJL+05qESo>~z{LfeSWAT=A3ij3no zASB3Z8MiYXHl)d!{#mVKNVh=_^y8p|6;HOA=VZSI@$i;o#!b>8Ded{>*W0w`v)t28 zyUH1m)=g%?>6&oJ=}r@ej^j~m1BgANm0UVA5rBB|h*W{hX|lxt+b0RNrZPn#1f(c@ z3YoPXoA#YuNMW{U_jv2u@iE+ssP2PYd#M!k$1x+hBbRzxrZ06=ymO>mI>MMC8)`G) zSxkRY7eUIDx#d%19H;-`|pca!;YHbNkJp+?I@x;)OgvcXZdajyfJ&AgTb>pY3<- zcC}A*3i|OvRPUsq%>aBNz!1I{<&gO{5zb>6&$XQJd}(c#nzD-0`>|2pr#3nuUkjWW zDZ&Y}$daOg`_&76n<{hx57)9BqXDw+LxPICFRkcUs)_l&zz zD@RXE4=F$!$^TgXN)Dtx;R*h8RWtN5BCGHc+bmzE&-314fv0j^2X4fV2Za@%>K!q} zt|Dcx05l&LvWwvK1^{`cV|5kcOpKtI&ZYrmeU_!a(9bHj`NKw`(phLnrNoew zZ$RQWmh}gwddZ0+sXP4UN$u^Kds)XN%}8`GC#8Y=H<#$&)Z?>gKlDzSm`C|G?GH;M zGavV-sWK2BA1hh`x;%&Pz2=zby8-e6tB5wMzn5m5ee&=Vlz6u2j(wdn;V290FZfoh zf6CL&qm>v$VjL+ldfhqxe&DZcl5!v6Jx=xDvB8qJ=#iSi@ep?hbZ6QTyg5VM5Eagm z9dU$Pv|3ivF*^W2_~t`n4to>&VOzWzK#w!GsvA$F|BJV`0E(;Yvwi~vcXtSZkl@}( zf(8rjjk|krYnBGFojbSYR#)w+eY&c<_BnfZ|IgXK zwHDw@Z-(`XqE#Nhlr+QFh+8>jQ-;{SF56+$8IuaBdl}-x1S{l^#rF*!ezoSs06 zz%vj7?k(pmWP0oJpc#q}?2*;`Sny`2Fm+mQ*zaB#+RD^4y%Hw5GQ&0YO1Dui;!Se?*rvxsT<54>myRwvXmX92O!3<0xA%^kyhl ze-UyL)jFxp5=n)hPE#670kaFa#ed7xd#>Fn$yfK~@EXXSC81jL3-hz_VY<^oJuR5Y zNICufMv&)E&3-G|5#kH<*LWuDS>zYFB-mQ(&>YmRwi>dcxFQVH9%QW9v;=RaG1;^= zcc@Rwl?^-=ajYGetvX{msyOGq=HJjgV~hg&xxcP)+W?>a`KnisL&rf-ZDSh{$D6C! 
zMzxsqFvClS{Xt_`#)F+0B3#XsH5_RW`nbb{rh?n`fJ)ET>TEgEz z^Mp^O_>lYU3T5HUqaDF&!)pq=*cGBDAzHqZu??D4k}N)RHDJ{+KMx{~WEHiL-lNim zCUQMLdWQnn3snzA{4O!ZU{p$%;_Wd?o)ENpY{=KIxku8pDzw!GiyZrdUso?8S`}QK z!x(G!va<0;_HBZ%Yr1)~jH{MR)M|`0LIL7Gn2d-B;~`Aos#$_~%P`-uPQ`wHg&K^l zyl{1TSg4{#OBn%+%3KlihhMEJGD5K_S^5_#kL>c8%KW@+3Mm&<93Lv4)IE&?`YW8K zOEMu8_TEoXbujKw(w;%5@E0)ZUu11lHB?Gui*ze97aPlxF>n$ z(QHDFO=C>I60yrv#Mvl4%@YyDhh+Q`<;~?N;G0&n(2kxQ&?xFrzREZ#iz7ci%X8&4 zc8U|DK@gAI`eMf?!KaHL_kI{!)0Q;wkSiX#9){TG!#jp9Hn_6^zjllEp9cupp95r@oI?--nJe%k~;iUcT>pQ}BU0o7e;`JDJqstI|8yKn!6w)gUD!M&;5%TO2 zG9;->I0bOFH=pP4n-&trZ-6_%??Hisl*jdLv4b+_&Ys9=4V8|!HWlx$)?(+cBY(_n z&nX>;xf<@Gd0gIWx+o1^%hsFj9|K&P&WuEdh{q*3v*N7<=vD-pT725Baz~At+j69d zYv!8-)DHoxle{@sYVtf%an^WwvBLY%@=fA#H+Xr#;7NA*6e@|-;7RW9_;RhGNv{JmW);x;McI1Ev#( zdmgMi{;e)#P*U5-0{wH#SP?e8HTMw!!h|-^*i~}!^Hylk-N&X45jXPj2|*^IaBbm_ zOdowrzp=f|2)^~VdCn-#Db``5IA;3y3FOkHc$*|ef70GljnsrTXyKOnY3+3!#WMXF z@(Oy}ScOmNrRw?Jb2Vixi8V3Oy>4vGwh&Il*&DZ#1u5{%3tGJXKHcUSloK?J;9AlW zo{~dpUB7RI%MZ9#38}>`8gpSY=xLpn3Y-lli=$N#XjTlqaL%SepBnn|!x~1qLYyeT zbON`?s?}YXH==gv(O|M~bu55_6Mz^7hC*#zjeI6L{^!5cyl}N{K`!FiS?n3 zllzMFO7Mwj23!V#EU&DV?C-wKpRNv(S5XD>d9EL$dc8KPq9vcR#a@2Yf6!_)pyHo- zmF@6CouV{*2TN>z&+6rp(2stSDt?dc(Qz91NGGy2&LY^A`NfR0oO`qFv>X@EWICcC+lsU)cn{3)>=p;^t^uGnbk7^IK( zl(~M=zn*V@^g5x+`qE%`64#)+|30(ALqGc&n5-B|EtS^k@bp_Z+e<;lHiy3txN30( z>)U5%%j6})9X>)TFUJR9rYXn)W&z8+JY+T^Am14eRp=^kXI!-aThPI%8{LeXEe}r3 z0=ArJCqu4|n=MU0wLvf-W$oV%9^!(Hfz?RhK6Qt{Qfx#JgM<|0usI1^Nu^|ev3wRLt7uhXwZuLy!76bM;haF+&?4d_2GNGwbn94%680!!@+FYOYC55!!MzXp&O%+D$1QW`Dt`3Bi%T- zI38@Nq&(6ayED79D!zqTC(ot2Vq(^<-#>e`N`qZ-;s#Bef&|!D0ws`Yb-_=VSc)A8 zkFtcPJ`6oW-+xfhs(zdl4vzh$byUM%{nYK~#JuaaWK`cnlB&*vnb&O=bAZmNqV5A`oNoar{{bF-3)%Uv;L*21{!@7LZ`ko4;nBZT!T&mp{~8|s z>m>an{s>YWvVKbdX*r~GkwW(omxwcL$H4ggW&QhNjy1pcM_`0^3YO{pdK-EzWkD^B zus0dFB3aA)a(87;i@-SgH7s{&ivEG(PQ2ipgfut;?R$S5o~P|)$lZQ~)8Iv1OIw;T z&xsJf@lsj8*QZ^x2>kQ2t-c6|;eF&D4cf*BvX^hC*sGZZ`%I)X!$QVZbl$HPY@|8b^}-z>nn$p(p=LnpXxFxAYI+s~VdxBL&xUMKt1Hh(j;Skg9$nd%8Pg)j0vfMRG-VItIA10@1aWpO52 z0^pi56xWb@HY`Qyh9=BG4*bEydmJUKpck7TAk!k>BcycPn_sKTQ$R@#^#dfFC~E zdC^4WV)923l-VDw#cwxm#s^0IjZ96 zAzVR9r??o@JlddK=F#x}$x?n>2>;Cs|F$mv?+btIX6B%zTzAHOt74Qv8(%D;5TCLu zp=>UWTJ-yul95dRdp$HN3PeO7=J!#8!ad1^!U7-O<9za_8uxx*dyIEgfd9D0f)aJ5*L$b|c2aJ1uvP!Xz}sscCoptAML zQRynf54Y_0BOpUz*BH6vY6h(j#4Q=FNxU9lvI>6DGGty~QwO--+IB9A$y{jv2Sc>L zsg3ci5)Sj)UNTQxo=$Yu@~|=dv->JSmMRvODps^UJkK4GjIs6GltAAOw^I3jp7a~@ zU{#-=AZ8b1Mnz{>3#6|z<}jmE=FQ8)nL-gg>e3l%xn^U1jP5TRqknKkwSKTN^aD%q za2!qshS>g0kblXOTK#if({D&C#p}1$yC5_nCEGDfQuUrP%&ixq&jGHWAU`V!qG12B zr6)9F-ptD{)X-)8i3-Vb!#d!b6pe?g-r$A>7>i!zQrx3lVn&hB5?C}`R5UP9RD>5_ zO)0mKtFYZ0Nu})GYLYOrq`02Jh=a)TYskU5W3_Rm0JN&^K~t$vz)3h=3`hizohRgu z?OQ4qq2T%HDQ6vq&0*INh7;4#J`k64UICRXnTf0fWhXxD9bxo)!uDqbqQu4oR;-kh z?S7}ko~QnTd!|oa2qfu5UBRc5aEnL6oqgNeA3Z;{(f_5r3~b_IVwUaS)Juz|D72ri zn?CI_&ei+w*pr-)akEjZhJMh^>hW&ZkWM9m8xT*AY+ZxzOowa!aZFM`nolkk;i6`U ztNL8Djg%-$n`A{|4V(6N1ZfBP)mkaPkWaGy6>Hd8d(hnyr6tLObN1%+%*3NkRvk&l z8Z>KdzTR!5yH9;90c^s;Hdw(f*76fCjrI|V`{vsDcY(>6$&55K zXa>3JD<^hkKQo+4of>(j&2t6IPR<=OzBoLR5l1SktQZV}4N9j>@L*(@ ze7tmo^~*?L#o$KZfC)BV93GC;ii3Gc76r?o5lN64s93p>@BSdXIFR*+;zysWEqABo z1DRzvjBn1WG@9YzREHulp8%D%p7WgYET0!Uapmg{x5n+ige%P8o-x6dmyYpp>~e_!TEhh3 zvrDb=qn%o;gOJXdbJP*&*q>ngv|o3Ti(WN-V-=Y^_Ve#Q4^Aaf9rt-8evNGS_B10sBqaGzsjad47}vG&fsT_bOG>p!&M6U=2L~g$9eUPRv{^7WzS6pPjNpum zFxuv2%>SKs@9)RU{n&dRt@_;GOq_GLuUNbca@V4BrQp0gFcEaeCThaAt1%N5k#r#M zc$rG-A5UqHgg6wYG*?=JG(KKA4~LZ2G0U5EToJG^w@|#qf>!mqu3*-%KFg_@1!#GP zeoSH}qzCMAvY=n3TA;viQ%lbD$x)cREWyztTL>gtHVWNE7 zc!h-8L^dO!9zr2yh`Ed 
zzMQalJS*?ad5~SE&b2R;t0aIuypb1muVEOY=HNwqy)8Cr_f&<@8ke80@4Nl&nbL_5 zB7S^xw&Y^TcTbAnyIy|yD%Z|L`}-9d$oY&r%!a(Pz@5RO`@<613wK (3*EOlJK z6$H~}aBv3B9JeU!jH00`7!(2)t|^{&n2t4|cZ-O^U@|+9*`I@+XhFYvDG%pt7OfCK)&WpQ4Pi!g3kAL zATykVNQ`9Y=T?(6x){GS{)lKoMFXl`u?dzhSjLG2!xlrD4C+Oa%yzVVT0@-I@vcqw zykLLAmgwJG?13tp2CQ?Ec?k{?BD6)EyDD3IEBgZjIDvBx$FGJ9^x8P)}Q$xTLgh>4T(buHQRWQp<&7`ZHg0k_vIhlD1Y<6H}~UtNH< zX1^YYFMzcrCHJX8n#n|{SEH42ZldnyGs({a(W7P!eAEV{ZU z|F<>(TvT#%X@f!gHZ4gsCPRuj z6_&kQlNx=pF?Ca%^d75ZRPl%pEvzK`;; z%e15^0?6FF*O!!g%A+L%R9CMCeAP%yOBl2xNA5vG8JE?Mt}5(L%9Hg8SF|f8+wsgS z$~Wm+=0-rtBiMOy6_GB757mP?neE@cyi}xL==6}WOZf4@PI57lixl?stL8=|j0&k$ z@+?C<5=+3s6_g6Sh&-%Jp8$Rd!7c$WS?Bz8c$TiVQ8ntf<2S<1;6Zv!0Y+pAR>|j>J9OK|2Y+>;jr$9Km{@WZ@ zZ86S)@djT?H3{uYLev`_+A2S7hGQbu0H`vrE3?f|`WQZ+Xp^U=9+murLRw&Z3APi) z`r!^cKRZ522}mWR``rS4|E{VW6$4D97f+A1P~cL=yI7a+Ld~?8$r*PC8oMoP+PaO8 zyL;Li-DM*5;Vj9QKAiGLEvhhHG;hdef8wa{yBsMjb_1b$q2B2!Dk{$qYur0Fr;q;8 zFR*teZ6mXO_H6i6!`oL==^>zj1}&Cky?N&Ki!16~29|zK-u1HhFPf$C&Esze@1`8Q zpK%#yLoe^o{igen^6Yxp?bDUwhBnmmxZ{5<;?m~%nq~jFE1n*<{BRXM({epWM@I~1 zoeImT&8~29Z1$d&(sqL+g(NmNETgn_5e9oQ&`(g6Ddvzbgg%GLmyd7r)XnqhP*#~l zq=jrI2RxuT&nYq{x(jDRE1RRyH##fQ+LOeN68hXlJMbVyGa2N2wR=EZ1cW$QDT zV@zag-?X!e8TT4U)0BY=a=}-{qmIvmHQn}MdM83Of)nT+CvaOvin!jeLANgxUtZt6 zdeD!>U^2s)LAeY%FtYvB zs_e=@RF=;1hmQ8!)RNuuF(acb*Ta$p-gXdM&!05z0NBgl3fn<_+eD@|d|P&|p=l9* zBOK{iKL81asSQ{nqQ=Hng8Qw;lG#5hbTlxJrVGL zKcpJqOFrpq<%v`E1SUJxWi~dlIlKj-l*{Ae;}1zLbV?Y44cMw#i9tA2@~W56r8Iht z(O=R^aMNf)zgS}ntT|pt6w8-dUM>;~f5iOx(3g@x5?J~5p*1HeF`ZJOA2^mzhGM1u znl@yVsVFN$D{xB6d>gkCX!w=PHQ1Fiy{qNkg5XJpSv(aApO#=2?42rOdpZx&eaSEh z$m!tCAtUp9S4w^u#QwdDwKe6TT>ErFs&huk^LM&WkVEOnV za%0?q)sAFpa^pF`vw~JII(b{MK()Gi-$<(Vdx=(-$ukAcd`m!+-C>{>;c6NQuk^9W zCrn!Sej373?A7`xtB=1lee30y#GK_*g^2ekAI}R6utMU)t`v@pVq5TC~!UJ8^y~8uW@kFR9k${>_xJbu9WZ?;)ZaZj*gN& zMXSg&>D5oKtr;_}8@e`%&RjB5bS6vW4;_(ugv3r96t!(!_bSk#aglo=IbQE*&Be8_ zT4x)Wo`fClfnSu@1IfM|Pnm@i`X_E5>W@2wL?RybRYxSn}gg+MfmCKR!@4!|IT-l?C`C4*6D&`GXXBspUolPXlNG9 z`?WL_T05droljRt;Y(F4jH+HefTyr+6;kN|WJ!VkW;uGsv zb*fl6=gjLDZfaemIfSy4?yegv4J)$0DMjEx<2xG{B$1N~2osZ&j)=t_KWp+2;n+4m z?sAg<#J%eg(kolw0Mf<-?^^t<7(RnmB?6QHW3cWZutA$l(ZSVSaO*FIIk0NpXlQ3dvI)8^`wQJnszLeP zprP&AoSK^Ve5P%c7>sv6!rv&Sv59)z1^{q)#kgvoKEEZ#l;Yz!Va2LFRl*uO_Qlp19eh z+_Q9GczN7tTGWEb1Q1rcBS9l7o2nPa^vD##%xHmL#Fp6~rhIM4i4Ki@bo>}{6vV&N zUz(1+JUrx`MLSEQ0yjMg=0vop^>~xc^!nHPuR-*#P<^#=4;m+`PaZ!_hdM>ZzSbRcl zh{|9}sE1hUu2(h9_6tAb9ue(qqML&Yg`3Hl~=EAF1n^;l= zA6KFZlxiDE=t_yyl848WVkw5O&LovnlHI}`8>cYyv)MP+omYlN6B6r!Rgel6LIT|i zp4S2uPu!fX4;6b}00R>87Ha#AdAxc=GAtES5lQZSvXR-tingZkU?Q8xu;)y8NCg+h zB|^s{yz0eIda^hxjL~z*+AEn&WaDi+`XO4kHeA{VaNx_w-tQOYRrZ=e8|^FR+_*Io(-lIr~yro`^c62ezIni*3r z7|XwIH!hiv2%7X z7-w~BKE%z+U(Gj@pyP4h@LT_I)3D3g%#dMbP-4~9mHj&5rpeQ9Z-afY`tj7PPo!Ua zYtT^ITMEWG9;i!FrrSuJ;(Lp#4ah)j4)fLpi(-KlT935pmm?fI*}UCLqq#seS{2le zR%9454U8c^+1qgI0y0Kgq%tpDn`J-FmM7H2OJ`qRp1Obl3QFvXPaG#d%{PL_+w%!F z(NDXUn0o8Ry~114(P~fem4?9CP&a4jr+oxJwuAZ6Uh8COq3GA`(pB^H>~S@f>EnBr z7cV=+++;H5I{oV_PP+F^OO^yGdZ}QcnhHj+~oZ4H7;=qf3qWp{O#3AJ;6(9dQfbYgk4 ziT&mtQ4b=eph|`DBgST$vi&%Of-7c`@;H)8zfoHS8n^A(>bSdA5nb9YHhfvFK%Pk?qwpYzmxaI^>vh^ zT@VF)BqzY1!+XH6+$fU=T*^tgy5#Sn8@J-VEP0P+kvSk}2{Gk5)sUtKJZ}(Q5xJ2Z z3rl9%Ks5!;3O%>tnrBO}BOh7dZWU9wG~@Dyp=DVy|KV{XNA0R5JmJjR9S3M45_}zp zO&UV^+J-d`JpQF6hCQqjuI;U?l4Wm7PL=+Vr2AvquQz8^Q3XQ*$Nu++Y5r>ES%U36 z5jQ#U;&5!Xa-54{c$;JA^#XIR9aeY55@@pLckWMAwr7a*I;1ajS6%^YolJiuEW2Gd zcF~Ud3eNac>4NGvtT5Ezd;&uY(NzfFo6GoQws~s{GG)tO!2+^0x3Yv?P|jv^8{04S zmr)Ph1!O#3+{i&t(}1V<;!u>(fxX(W!7>394v20`T~UMySCJGS?-3dVJ26#R5kouY z?7Tk-Joi)@f0VUqE8Luz_qa6>WB3~KAaa(RFU1hQwH 
zlP2AD!|#b)y?Xz0m*@2)Mcv#LT|;pU=fg^28_ah|l1vsfr;f^mp}*jo*P}TQDBI(E z+gvDvaV-;+&Ba`=duN*JCnK=NS{dV6`A4O41crI^ed8k^m8gashA~&fpd6YmS>TE< z6tAC5uJk*V8e~fs+%qkd2HD(QJrZ~-7wD7pJ^4F#Xh$D*86VPR_y@XoPuRBgSTo@u zL?XG|$7lpMw=83v4v)aBhh$hApf|c>u3Kd-=fr6TS&BShIk7WMQn${`lx3Y>__2e3 zHphA53q%5!r63vk>`8MT4$;J9La{$cQ$UO9td3|e3H@*vX4`!{Q`d;Yjr6w{;xm%v z>(4BXcU)!6&hVuQ=&h$&DC>^g;^y$Czu_z`XdW8T?`13Rxp0Ag33L-C>$32AXvBB! zEKTsd$j-3+5s<)S47unK?sXqCE=>1x^af1$58IH?wyu&Qf@SCoBq($$p(FdHdE9kd z1eG;RYgB{Ga4qs?Rsg{1be>^7TDVrSo#MQjEw-u@?#K)8%0*yOLl8SrumUJx&E&g# z{VNk7P~sQe4gN^Rjj)NjanC69`ZLMm>#!q<@uAPXgdU_zuQjHj9=o&#KO+NvusksL z&(!!7tjZl;n8hzOv`Ai}jDWBYPKl_PMrhwK%fIRRm~MJCNQBgo@j3;Bh<4ue>*Y8ztG<#Z*{a()h50F zmH6iBP>+_Hi0IL6@bcrCH+g-Tt``5?H-UTMW@5uvcH5uOTg$jRerLjqB2moIuvIhQ zFQ6kB@9}wt@s~us#weCM~ws zX&dYRZ3{Dwf3Z(z`u;;$$wuslu(Y1-8@^#^WbI();QEGj0N=2aH=yHRBNP7WXuZAu zgJap>pYWf@`rmLY`^Vw^3wXkT#3X8C?I0v%vJ0#-02f$?&h- z$=}`o0G_;goBdbd$=kdC96b3qEA)SJE&FfVsK2o||2Kc26+LGD=33T!%GX0uA1D%Q zRPT-;w4l(R?PasVUcp_=YlbmXSsb54IU%e^t()B(o6iO6ers-IZBIYeQ%b|X+ZTL95)gpb!XYoSIH|s5jUv5fpj5;c47B(2(PZa%K z=rZm*tHX|tc2qGrf_=xe3&i*;F+?NtAnp<2=eI#2-sq2$0W!FlSfofZzCjMxK6_HX zgWx_Y?m-gR#T#8yUBNfTLE8Gm+BmPFh)b7cT}$|| z?7hCM^VvJC-L{_g*=N{~B1=jMV;6cW8J6fQU2oNmFdQ;hM~j0>^c+`colvDHCoS8P zN}yn3WzXraWOMkz**~9u)dRHe@E%UF&0ZVp_Jf;3LUE0R6+LNgi4OtnCFr(j6O{a2 zPyRczGb5;rHn-WY*><#wkypw2-;St~gTCdcm!xOSP=88cFb0b-6z4j!IqC4PR&HhIa!_&1&37vLDX6D7+ zh9ywDcFn7iqx`yht&MALV_wT>bZ%ci%Z*LR=3TRJHRfBjKBUUZB~q|Sx11w-jjzQj zHj_>1_4A~Xw76f*OhzU%{~>oy%eJX?+jk5IsrBM3`e8{e=g`w9?Wf)yxrX$ z!@3xs+@h+YmIN7og=0|FP+*eyrQmv?xLyuo`Y*;ZXfEI@==3% zIu^b~MA!q&Ol7$bDiLxtYcJmaOsLiT~{$Cc`&XeG5>bU04=LnBu* z`oC^dcx#_SH1@*Ffsg=t_i%6M+N!86(PPo&1G8*Wv)xATi0fm zL;qM(z=bto(RJix4(C<~8~y6dm$TxKEHP1Duv-Mn~;(M8Rk7%25$tyI@ zGxR%YPi)}Knk&<>5${MIw?}}%y&|Odv^sKXA@|CHJJK0OM!S9NClfDJ+s!zpx}@0t zha2*&)A16Aw9(1HEABoytoY*hFa8<%CheYxwddwM!bN9H$;JC`Don&p7+t>@LCYA= zQ0$W)?M1%{B3L$@yi+l``wvG+nB6%n^guqV_K(6wp;*ZuP$c|MvY39Ckq2;vtUuMk ze}IWb{1|W$xA#k*yj0{DJAG@I>RJzcC9%YlJ1((Msb3*^S4gwfR_~__-zS6|c!S5j zam-0TSSL#e8lgv$yKOI9e~KYwG}Z-OE0+mo;|b43UvuqBkw=iTBR5dO)!X`PVZ+a7 zXCvSH{>Dp5q*O(%gDyDXBea$P_C+}Y3y3H=3K~=|w4lxN!$uC84=6ZbEep-ruy%7#t$vf8eUJ=*ANb=2To|F)6 z%xN`Y)VN?pqm0x&oH9h!UpDX%gcg1~hb_*a)yw`X&>4u_$}e#$yM!2GB#(RHPG${{ zIXtUa!0r!j3v7H|*F+%;?u}Z-1bk)?#H&<;%9ww?>uTeIF48Sx1ohZh0x!rb4#mbr6Y^@N&cX0y<^NExm0-;tm<3HrEtSt5QuwSBnL zkZXl{#XQ(kLux*C&PusMkpz9_c)sGdiiEfOC-ukQDNg^nRDCN99rYbtZH@kBzyB&f z|0>zvg2IIK?2Z2FA~8uTiz=(D(fx@8aAuj1$bOHKaQC;aDu{&#Eg{}=}K z|9?ULca``LBsk!|ZG_A9*NxXd){TFqL3RC2g5M;O_*!x@@Dd1$f4LincYK7Zu5p5` zrJB9y@LH=~{yh#GOgb{ms_tsd819U?MGQNo@4M`na zL||P)3pZNL+iuU`@9D$BEDjrD2t&jxC`669feC3^SCHfXN#nEnvKcxub4_g6QOi&f zc0WmVu({GjIaXBsn7e3r4@z(Fe8QU>7EWWg`7EZ{#T{7>+lv7N^6>{~xP<(ZDgOrU z|NBh%uS@QK3*oYNurt!LLUKtv1|ilMD?9G+UjOnUThrWh=ZI92$5vv&m6MZ`!h-Q7 zfI>A7BVZ3vgiFhX3kw!{w~G-j5|+18Q^SJw@y%Xb%97owzRU?Oc7DL_QyCr0idBWk zDd^?7^>O9-L@>4bNhRoBjdD)8sc}^O#x0~T^0C+9pAL_jJqPPWf zuH6l4lrOQdrZPNEzs6i#*0)^cYrJeq-oAk9j&4ozMu=IrkKylA`>D5I4FR8C3&4c; z{IQ5VrK#6jUQCBWug0@6ug9g7uXJi064{Wu%rD<=RDUl-5jl;#Mw%?)?=W%kd39ax zM4Irsj&z>etB{~`YP(NukoI-mz=MtOOPkMrg*uVwP|s1~r$ILFf;5#F?3KzbQn;z{ z4HCE+=QdN+qCM(x_=2cWNbM|fuX}I zU-{`_?7jmwt{>AW{K(Dcl{=nByv*xmTkrAsbiQEKx0@G^YsX!G+|lJ2)msr(@!JIyWlWTI)Bz=WG&v2_G41JBPgos?og`A8zSbwLX>DM0tgW zUsN>yc&P;pwcX&|&n?itHZ5)G*fc*Ko<@0L7xcFcq+i&WCqAX}DZh*yC3u0|x7L=m z@9RA>pERmop$U69ItR@L2lodDp^Ub(s_lMN+vQi=B~;thRog{W+li}j1J$@?)wp%k zxJyelvr07!OG)Ha7!_0)Ra7Eq=@P%tCBC&Zbcu9yz%O*NiYnzeDuzhL$dS)$M@-dg zi3{ojvm`pL3?#)&#KeTe@k}Q!>asAArb{ zd%?*E*?K+7OVoDb-3`B#Zd1`HoT&k&PZ_X|IJG?gI{a$#Vqd4vJn(wPw)Wtg?{!rU 
[GIT binary patch data for g3doc/highway_intro.pdf omitted]

diff --git a/g3doc/impl_details.md b/g3doc/impl_details.md
new file mode 100644

Tag arguments (e.g. `Simd<>` and `SizeTag<>`) are only used to select among the
various overloads of functions such as `Set`. This allows Highway to use
built-in vector types without a class wrapper.

Class wrappers are problematic for SVE and RVV because LLVM (or at least Clang)
does not allow member variables whose type is 'sizeless' (in particular,
built-in vectors). To our knowledge, Highway is the only C++ vector library that
supports SVE and RISC-V without compiler flags that indicate what the runtime
vector length will be. Such flags allow the compiler to convert the previously
sizeless vectors to known-size vector types, which can then be wrapped in
classes, but this only makes sense for use-cases where the exact hardware is
known and rarely changes (e.g. supercomputers). By contrast, Highway can run on
unknown hardware such as heterogeneous clouds or client devices without
requiring a recompile or multiple binaries.

Note that Highway does use class wrappers where possible, in particular NEON,
WASM and x86. The wrappers (e.g. `Vec128`) are in fact required on some
platforms (x86 and perhaps WASM) because Highway assumes the vector arguments
passed e.g. to `Add` provide sufficient type information to identify the
appropriate intrinsic. By contrast, x86's loosely typed `__m128i` built-in type
could actually refer to any integer lane type. Because some targets use wrappers
and others do not, incorrect user code may compile on some platforms but not
others. This is because passing class wrappers as arguments triggers
argument-dependent lookup, which would find the `Add` function even without
namespace qualifiers, because it resides in the same namespace as the wrapper.
Correct user code qualifies each call to a Highway op, e.g. with a namespace
alias `hn`, so `hn::Add`. This works for both wrappers and built-in vector
types.
## Adding a new target

Adding a target requires updating about ten locations: adding a macro constant
to identify it, hooking it into static and dynamic dispatch, detecting support
at runtime, and identifying the target name. The easiest and safest way to do
this is to search for one of the target identifiers such as `HWY_AVX3_DL`, and
add corresponding logic for your new target. Note the upper limits on the number
of targets per platform imposed by `HWY_MAX_DYNAMIC_TARGETS`.

## When to use -inl.h

By convention, files whose name ends with `-inl.h` contain vector code in the
form of inlined function templates. In order to support the multiple compilation
required for dynamic dispatch on platforms which provide several targets, such
files generally begin with a 'per-target include guard' of the form:

```
#if defined(HWY_PATH_NAME_INL_H_) == defined(HWY_TARGET_TOGGLE)
#ifdef HWY_PATH_NAME_INL_H_
#undef HWY_PATH_NAME_INL_H_
#else
#define HWY_PATH_NAME_INL_H_
#endif
// contents to include once per target
#endif // HWY_PATH_NAME_INL_H_
```

This toggles the include guard between defined and undefined, which is
sufficient to 'reset' the include guard when beginning a new 'compilation pass'
for the next target. This is accomplished by simply re-#including the user's
translation unit, which may in turn `#include` one or more `-inl.h` files. As an
exception, `hwy/ops/*-inl.h` do not require include guards because they are all
included from highway.h, which takes care of this in a single location. Note
that platforms such as RISC-V which currently only offer a single target do not
require multiple compilation, but the same mechanism is used without actually
re-#including. For such platforms, it is possible that additional targets will
later be added, which means this mechanism will then be required.

Instead of a -inl.h file, you can also use a normal .cc/.h component, where the
vector code is hidden inside the .cc file, and the header only declares a normal
non-template function whose implementation does `HWY_DYNAMIC_DISPATCH` into the
vector code. For an example of this, see
[vqsort.cc](../hwy/contrib/sort/vqsort.cc).

Considerations for choosing between these alternatives are similar to those for
regular headers. Inlining and thus `-inl.h` makes sense for short functions, or
when the function must support many input types and is defined as a template.
Conversely, non-inline `.cc` files make sense when the function is very long
(such that call overhead does not matter), and/or is only required for a small
set of input types. [Math functions](../hwy/contrib/math/math-inl.h)
can fall into either case, hence we provide both inline functions and `Call*`
wrappers.

## Use of macros

Highway ops are implemented for up to 12 lane types, which can make for
considerable repetition, even more so for RISC-V, which can have seven times as
many variants (one per LMUL in `[1/8, 8]`). The various backends
(implementations of one or more targets) differ in their strategies for handling
this, in increasing order of macro complexity:

*   `x86_*` and `wasm_*` simply write out all the overloads, which is
    straightforward but results in 4K-6K line files.

*   [arm_sve-inl.h](../hwy/ops/arm_sve-inl.h) defines 'type list' macros
    `HWY_SVE_FOREACH*` to define all overloads for most ops in a single line
    (a simplified sketch of this approach follows the list). Such an approach
    makes sense because SVE ops are quite orthogonal (i.e. generally defined
    for all types and consistent).

*   [arm_neon-inl.h](../hwy/ops/arm_neon-inl.h) also uses type list macros,
    but with a more general 'function builder' which helps to define custom
    function templates required for 'unusual' ops such as `ShiftLeft`.

*   [rvv-inl.h](../hwy/ops/rvv-inl.h) has the most complex system because it
    deals with both type lists and LMUL, plus support for widening or
    narrowing operations. The type lists thus have additional arguments, and
    there are also additional lists for LMUL which can be extended or
    truncated.
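
To make the type-list idea concrete, here is a toy, self-contained sketch of
the same X-macro pattern; the `TOY_*` macros are ours and merely imitate the
shape of `HWY_SVE_FOREACH*` plus a 'function builder', using scalar arithmetic
instead of intrinsics:

```
#include <stdint.h>

// Type list: expands X once per (lane type, name suffix) pair. The real
// HWY_SVE_FOREACH* macros pass more arguments (type prefixes, LMUL, ...).
#define TOY_FOREACH_32(X, NAME, OP) \
  X(int32_t, i32, NAME, OP)         \
  X(uint32_t, u32, NAME, OP)

// 'Function builder': emits one overload per expansion of the type list.
// SUFFIX is unused here; the real builders paste it into intrinsic names.
#define TOY_GEN_BINARY(T, SUFFIX, NAME, OP) \
  inline T NAME(T a, T b) { return static_cast<T>(a OP b); }

TOY_FOREACH_32(TOY_GEN_BINARY, Add, +)  // defines Add(int32_t), Add(uint32_t)
TOY_FOREACH_32(TOY_GEN_BINARY, Sub, -)  // likewise for Sub

#undef TOY_GEN_BINARY
#undef TOY_FOREACH_32
```
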
## Code reuse across targets

The set of Highway ops is carefully chosen such that most of them map to a
single platform-specific intrinsic. However, there are some important functions
such as `AESRound` which may require emulation, and are non-trivial enough that
we don't want to copy them into each target's implementation. Instead, we
implement such functions in
[generic_ops-inl.h](../hwy/ops/generic_ops-inl.h), which is included into every
backend. To allow some targets to override these functions, we use the same
per-target include guard mechanism, e.g. `HWY_NATIVE_AES`.

The functions there are typically templated on the vector and/or tag types.
This is necessary because the vector type depends on the target. Although
`Vec128` is available on most targets, `HWY_SCALAR`, `HWY_RVV` and `HWY_SVE*`
lack this type. To enable specialized overloads (e.g. only for signed
integers), we use the `HWY_IF` SFINAE helpers. Example:
`template <class V, class D = DFromV<V>, HWY_IF_SIGNED_D(D)>`. Note that there
is a limited set of `HWY_IF` that work directly with vectors, identified by
their `_V` suffix. However, the functions likely use a `D` type anyway, thus it
is convenient to obtain one in the template arguments and also use that for
`HWY_IF_*_D`.

For x86, we also avoid some duplication by implementing the functions shared
between all targets only once. They reside in
[x86_128-inl.h](../hwy/ops/x86_128-inl.h) and are also templated on the
vector type.

## Adding a new op

Adding an op consists of three steps, listed below; a sketch of the test from
step 3 follows the list. As an example, consider
https://github.com/google/highway/commit/6c285d64ae50e0f48866072ed3a476fc12df5ab6.

1) Document the new op in `g3doc/quick_reference.md` with its function signature
and a description of what the op does.

2) Implement the op in each `ops/*-inl.h` header. There are two exceptions,
detailed in the previous section: first, `generic_ops-inl.h` is not changed in
the common case where the op has a unique definition for every target. Second,
if the op's definition would be duplicated in `x86_256-inl.h` and
`x86_512-inl.h`, it may be expressed as a template in `x86_128-inl.h` with a
`class V` template argument, e.g. `TableLookupBytesOr0`.

3) Pick the appropriate `hwy/tests/*_test.cc` and add a test. This is also a
three-step process: first define a functor that implements the test logic (e.g.
`TestPlusMinus`), then a function (e.g. `TestAllPlusMinus`) that invokes this
functor for all lane types the op supports, and finally a line near the end of
the file that invokes the function for all targets:
`HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllPlusMinus);`. Note the naming
convention that the function has the same name as the functor except for the
`TestAll` prefix.
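
A schematic version of such a test, assuming the helpers `ForAllTypes`,
`ForPartialVectors` and `HWY_ASSERT_VEC_EQ` from `hwy/tests/test_util-inl.h`;
the assertion logic is our sketch, not the actual body in `arithmetic_test.cc`:

```
// Inside namespace hwy::HWY_NAMESPACE of the chosen *_test.cc:
struct TestPlusMinus {
  template <typename T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    const auto v2 = Set(d, T(2));
    const auto v3 = Set(d, T(3));
    // (v2 + v3) - v3 should round-trip back to v2 for every lane type.
    HWY_ASSERT_VEC_EQ(d, v2, Sub(Add(v2, v3), v3));
  }
};

HWY_NOINLINE void TestAllPlusMinus() {
  ForAllTypes(ForPartialVectors<TestPlusMinus>());
}

// Near the end of the file, in the once-per-translation-unit section:
// HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllPlusMinus);
```
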
## Documentation of platform-specific intrinsics

When adding a new op, it is often necessary to consult the reference for each
platform's intrinsics.

For x86 targets `HWY_SSSE3`, `HWY_SSE4`, `HWY_AVX2`, `HWY_AVX3`, `HWY_AVX3_DL`,
Intel provides a
[searchable reference](https://www.intel.com/content/www/us/en/docs/intrinsics-guide).

For Arm targets `HWY_NEON`, `HWY_SVE` (plus its specialization for 256-bit
vectors `HWY_SVE_256`), `HWY_SVE2` (plus its specialization for 128-bit vectors
`HWY_SVE2_128`), Arm provides a
[searchable reference](https://developer.arm.com/architectures/instruction-sets/intrinsics).

For RISC-V target `HWY_RVV`, we refer to the assembly language
[specification](https://github.com/riscv/riscv-v-spec/blob/master/v-spec.adoc)
plus the separate
[intrinsics specification](https://github.com/riscv-non-isa/rvv-intrinsic-doc).

For WebAssembly target `HWY_WASM`, we recommend consulting the
[intrinsics header](https://github.com/llvm/llvm-project/blob/main/clang/lib/Headers/wasm_simd128.h).
There is also an unofficial
[searchable list of intrinsics](https://nemequ.github.io/waspr/intrinsics).

## Why scalar target

There can be various reasons to avoid using vector intrinsics:

*   The current CPU may not support any instruction sets generated by Highway
    (on x86, we only target S-SSE3 or newer because its predecessor SSE3 was
    introduced in 2004 and it seems unlikely that many users will want to
    support such old CPUs);
*   The compiler may crash or emit incorrect code for certain intrinsics or
    instruction sets;
*   We may want to estimate the speedup from the vector implementation compared
    to scalar code.

Highway provides either the `HWY_SCALAR` or the `HWY_EMU128` target for such
use-cases. Both implement ops using standard C++ instead of intrinsics. They
differ in the vector size: the former always uses single-lane vectors and thus
cannot implement ops such as `AESRound` or `TableLookupBytes`. The latter
guarantees 16-byte vectors are available like all other Highway targets, and
supports all ops. Both of these alternatives are slower than native vector
code, but they allow testing your code even when actual vectors are
unavailable.

One of the above targets is used if the CPU does not support any actual SIMD
target. To avoid compiling any intrinsics, define `HWY_COMPILE_ONLY_EMU128`.

`HWY_SCALAR` is only enabled/used `#ifdef HWY_COMPILE_ONLY_SCALAR` (or `#if
HWY_BROKEN_EMU128`). Projects that intend to use it may require `#if HWY_TARGET
!= HWY_SCALAR` around the ops it does not support to prevent compile errors.
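
For instance, such a guard might look like the following sketch; the
surrounding function and its fallback are ours, not from the Highway sources:

```
// Inside the per-target namespace of a -inl.h file:
namespace hn = hwy::HWY_NAMESPACE;

// TableLookupBytes shuffles bytes within 128-bit blocks, which the
// single-lane HWY_SCALAR target cannot provide, hence the guard.
template <class V>
V ShuffledOrUnchanged(V bytes, V indices) {
#if HWY_TARGET != HWY_SCALAR
  return hn::TableLookupBytes(bytes, indices);
#else
  (void)indices;
  return bytes;  // placeholder fallback on the single-lane target
#endif
}
```
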
diff --git a/g3doc/instruction_matrix.pdf b/g3doc/instruction_matrix.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..23608f361087f6153a1b21ac8b30393962670efb
GIT binary patch
literal 155399

[GIT binary patch data for g3doc/instruction_matrix.pdf omitted]
zQf_1n&X7_X&i1BtpE!r6kgP9SAgYJSnpoWoOPQvW^t}ci97Z_XcuBkmvlNv-Y>A7| zR9?#j>pSK_wJRDWPS{iKf7IKlm?R`@ds7&Y?gF1Yy}v~x(JA;?7S#8MYfU$LdH9gD zn{GEMJMw0RF@jcV)l*mT+n-K-tn>&gG`=9ANx!p@w>c|TyVsj?rH{ASTyN=Kko0)A zjMPLz{=Ti)sx5Iq;ocAx%6IgBEG%MO_Y^XHS92F)Ak+1=?vJ9XeTF+OTJBMvS1l+I z`l|JT!$%e=l%qkx{`rsETX1}qDg2*kZS2(SZamYo)9S0`9}yoBk*8Eom%ySOH)ZqK zV@#LSL)G1yg4R#@WcjZ=sGo!H(I45{IuF1@)7wD0!Zr2FBZ?(KC-V}EQaXuu<_q$7 zcd5B99~0WI*l@Je9@H)deam}L<;9|TXMyqXuG?}%tN-Hm`c{p1Z`PslF%Af_M^xe` zN;6Ob(~?W3a_i0WTc*4#&QXGyu{}1Zu4tS!2?(>_x6wrILn9;qqTymu3Cv{R zWzZHvg$1^wv+gXWfQOp=U{2{$EE8Pm_x}h>?58A=*)-yGk{_AL1Z0R37C8BqB(lQ5 z>AlDP{BUFUl+K#mXCy9zk^Pq`7(hAUZLU+El1TR1V9zJr8^ESu#1DFpn>L6cIyDa`=yy3<{xIP3j08qiVfcsV4k`*n6eQTHDd@O?KI9=4;E{>J!lCx1K2 z1=#U>{VK9+>KG(zDrK z1zir*LCyLVe#>e7B89nOfOf$gVX`=%(Wk&V%KJIL8Jg---i!66i0U1qGVqoFdJXzL ze?9*VC6HxNiLBEI+K9m2|ml7b`L-N*$igqRCbpZdJfF-rU|`+#=6BgA!2kyO zYVYS@B@gm}*H<0EfQPy-Mp6o2YcO!*Hr%PoS$wKN+!Y$v6*#Og)o_p`Z55p_u5T>55FRv)5g=BbP z(h@;(&vmbFu@u1{sEQzgRk6{@!=;RKtb?@Phu3JGA>BC=4ru*YSw~V3f@{_}o)=v3 z95~+G-M&$AmNVycXl*$Nv1pjYOxL9djh|Jb5MIl#O{Xlk}8OoZwq)Tm^X_UJEmBhFt%P-+FkOX@~DA6W~iXX(i)?ovc# zX5;Jr zq26l+?_Xp-OOFTMr~oLNCUN!{WC>%~D=H>sMwWygSu>FmP!XXMDD~Scr%=W$h+|PTZ2igx z#Z_zq^N>Vcbxn)dmVqZIu;I>Vui%)j01Q;l1A06E*pFkhQo(8d2r7NNx&HOhT2o>~ zSKxw^6)&!p<9Kz4H)G{GJ+r~U?YUPmcN2<3wrmw@N7i$iPuES=l-t@v*{%U!ue;Qa z%gxdgL0+Pw)ULzJb_ZhIgV1i^gK#Kq`+pkBCwiLJfNzc0hGlTm9!Zl}ys?76uPAR0 z5rj|b4P@$sm#9lY4?*&Dv*nxrq$L?tvx01d1tq&wkdM_)H2%psh#Fr~Sj1%kMak{% z_2pNl-{)!`I>_l(GJHoASUiRkB$%TDuddhI62*oV5oKQm9fMTvnF+}*zcYAGnJdA3LzhLr_bTY%1&Vp+2zt|wIde$Hl)k+t~Z*;2|CTj=l<^Z_Sj;(AHlpVy^nUJ1Od8yNs(>aJ-$Y8`aRUl z#2xg&fntg@zS4H}>k#r83}-l&ct@x;9s}L1S`*5t<*iT8+ID!CI8?Ir(~UW4Kl?3i z^2r!F8SRbc=JTPA(yFNMapN3UZPd+XNb{@Jo8R&;8sFOoqb84h1WxnUhkLmM2i*5 zfgFq8hS5hnviihDoiX}p#(v6CRlD$(Z;Dw~_qTq@?!muoLb?7iR8;afSyh>gs`R00 zG#zORA=Qq_nBh4OG;e=);<8YTb41~OM zQ+`vX!kWihlR7ikCgY?-8HaVnjcz7QH_dTVFE-B{AB}=L68zTX0@jiO*0%zG$vkU? 
ztK=5l&WHUyuSJ*O_U79CYK zB=CyQ#2q1qwHjZ2pq8~q;wRj zhCv2(oXSy-ZlxHmKwo;*xh&Tom&(yY0oLo4O$YbUYFjM@lKkyvvv-vl!u!C zEIt2zHK{DuTvtQaPDlcEcsIS&U(ma(*J*FI-BBXo2TVYz-uF;|BWI zzof?QfzSt=mR3~G*wTbEA9Jjeg^6?n8zVsitz?Z4xGS-&ko7kNX&d1-Gq2A~qC)DH z601E8D#YLX!L_I#fB@M*}@unqg8RL0;nkdS!hGoW(gaJ_@n7jqT(NSrZ z@~G??1-OpRH2c#VfQYnn%Q|aw_LXQG5-1Pe=!g<4j>M;qWgNa`ZQcx7Cc=n z!Q_`DB_qp8teU2I{Eo^$ixM~ndL6J$6f?X>m3pxwj{vL>^|}B>jIMF?BO-^WWBHE0 z$b~3Uxo{d9iwevSO{-;8O9DMAx%l(y$v2b1q?NXj-Fu=as^Pj~zsR3Pcb!Ihdwh9O z#YRnrdczj=#KhAC@`S^;Vqz}{;naY2_j7h2q++4n>2KEENP%88iz!aD~#qXkwN^P zLa-jD_xbg2NeEDvF`tr;EpVL7fs1d=FC7%5ujD3vFray_rd-G8vkFBIeo4WPhLT(?E|*B>CP0!s5N&}K?)W_#fEkB;eR8){A4ti6 z2{)J2w~oV!1bA`@HG5E!TpUYrTzKiBsE5!G()*@mgYJ9tFrEA_Jz)gAou{eT2zR_s zKxEg;4Vn2jQK|~Rj9Qme!X&u_V^VumcDoEGw(@kBRN&;?Za6F~!iC0(eO|9Pl!Hl1 z!{z**nLiC zT*->4B_X%J=RlW)cPI^<=Ku;b{Pch+>lFl>lYSyZAQQA-pxdODbnpzwDia)(oVq06`rE3I5pqM57SaCPvs)L8XV2KpnZDTTG_A5$@wAkRvT zhAfOQ4Tr3aE>l%)(-8~Rtx~k|42(1lhfIuR8SAEW43HY3(;;h1lSH9`y!!9;A2FHN z9D7dj&s6n6To^z#epFM-X$Da5^t) z&LfmTDOqSn9h1yJBu$k;$PTS_sP^C^(n$C}^0i}hgiEyl+F$Qq2Ip(!Z{dFk0mMu< zyToySm5wGYim_xjpQ5h~Dl3E&UBMyYlWN@0u8E{nhvcVa5*T+b-fP)LPCWtklpa{b zmMejjBD-$VWsEKv{)TG&*U1tNp$*bcnUPgz4zG5WNIpX-lR39dG*2>hCZ%?2h;Gt# zSU`FY{-4^HTptfN!-bRwBAX0mo6`wj@AHQJB)i;tKfq2b>3xzG<%`#4NGxEywnSP& z@W~!P#p88CZl8#Jv$PFH)j6(Oqw+!bQu4v>w!(1#PYx4goMG`6ePh1=?l)&YfS#}m z00i@biV{>&y^7o>*X$a1c-GE^TY>|h3Wa6%8Xcv_!D}+M&Pk1q83?cOCrHTJhqFZb zZ-<>M2sKQ0W`uv{T*h@hy}@|s)(np@zg|VWCbRzxB!g%T-yEqK@)bmawp&n~OHf=y zP+Xw5I01O&d{RpEDPbi39l&yUX*hMu<7~C4Bg7sX`mru^I}>NHlWhPsW5TyE`b*@n zq&L4tr_tWKaFJHa`0t4*!||^xgUfgwp1=a97~BdJli&pSdY6Joj#-B9YJ*8hSzDSM zudBhd*QG-~sA-}VD-Xb*#`hHuHGz9!0O)xHe!9R2*1%X5b@=Yqg$a|T6P_KW!Obpr ziF{cL@#cQwQNCl0dyPr9x>*LzOshI=i*K!OZ)HERO}JIKoq~QM^SEb_>P_X#HcC3x zRFmA|Nyo`sDWq?3kE)`A-O(t_f8VZAZ}=HjZ*`Y|72d zQ~dY_;=D>gI%)MvMrqsr~7*KL~dU{nm5as0F6@OJllRI%Fk_8Yp- z5W?<1YYb;+$W*XWwCZH)ZE*qnn(QJh(e0AF0D;%XW8Sy2+uIp_uF^> z;%hbRf}V>u=`(52iQXw?eS`mJ%UUBXg%9?2CE=IX&$R#GhV~YtT;G-!ehDsaLOsc)L)bR)4Vx?~J$Dj? zw_;O$L7{&No0TDChKEsn?q6xM&PZbzrhyw|#TX<>^lcDoq$KHMUQvN5%D)RVh-F6j zz}znblw=BZki8~i^KT9xu-8Uvb2AinpOfJ}fRF~rlEya2=Nad5!0d%VT^O~aqmH=u zH2fD~M$7YQ_6XoQ{Z+vY7$u$Bd%sW^!pGJW!nG2E+amZe{Ez|vf_$~=5a6|cTkzud z^6k}EW^Qj|%IStbsw0Bh{jYCCe4&_qE)RiZ@>j!(79Kv@7Rok02JLpr{>JI=m;PJx zm($ySFH80!%{1=`uY9Y2@92&%{8j(JB{#Qa{-aq<7?==c&GIh%WVuz_RU1g!j~Wi} zSVoUrBkwwmxMg1R^@3ytbXgU%0G~zKVUGJVMcmlAGx5EXI|PcXhY6;x&Q6`Zc+)o* z2exvNN)?zTisNEKhti)Ial=;Gg0+&pKI~RsK(W5Hx+|uog{-#uik4=?y}U&oeAXL#v)-+qGFe(VqRc>{~$b_+7^dzjnphT3`Of$#N>Tr2L` zdvtH0pXR<$ec@O&a#72MD%)~Zk=*dDEaIY#E$%^(T5yK8mz0ocJ@o6RwLjNbm(3_M zXETh2|BUrLGQLgPp+*>0PwiJZ!oHKNZi7|*Ob1{yD^_f~p@VtYT3O;@I(tLd)K2Ffxc-@a-245& z0!9^zhG%Tzkk)pDlkzVuZ7_!gD1?_1tU<#}=g{;=udA$1U$7TUURdbjX(b@!5Uvz! z%fe@_v+`E#h0X+;XZai%@>3u*+2;~#(3qY?lZhUhQiD>1YCOq0Df#a?{v`edzDKN0 zEUSdg-Qi zxyQ)D6b;b$o1XHgDbY)J-Wy`xn|j_`CLvQY@1oZHNmN=Qo`^7-H#l`a{NtEmTxS?z?fjV+u3#kWAg-y? 
zjGLEt3O!}TdQJ+qTVAr8G~sxZJH^h}oS`ltsnqPQ_`1}1SfX?DUWEG}_R@C1gzn=b zE#@X1HpTG@u@{pkv-m8+=&6SAi4kn$DakgxQD5Fxc=6?nSB6<-ee$% z-&rP(GU?>ZbS29ptHz0po=4gel9fPrMGF&=x>nudRyAd&g{V+9c*TBZS?nG_^bFN# z7mh_2cFWOX>j*Z+C>()IC%5`0K?8n!&Z;uP{o3QKws%O+<%pvFlrspV*GXZ4{~sMa zbI?EB+r9ej84#&*Tnp*uSR$o4rZUB6R>gQ4aMrubuXSs?OJkLtqC5B8CeiNFq zlZ7o@D$Mc*=0;3u1A6Bm(({1jb^%mzNLZDI(%txK0y4uI{xnSX#il_*Xu1o{uU#l1HKfwn-o>@xzqOe)+!YPSXS3# zWV#pnm+SGb=>3+X!H9~RTxPq)XVot)_@>&2`7cg4??ax#2qYyINEbUX8xPS5AFz%m+8&6`I^PZt| z?$$78VHR7S2g~GFCU*XE6jzYwX;h4OUSe}t*gqhBfi4hYHhKC4 z(gd1en$<)bV3feGRYBmEHY4%I^ViNt&o1KU?8f2x$7Rp6rWG=9ODCsT4hXadxbBae zX-FY{PAW;#aWa{#u)ncIk`YM|^Q=xQb3+~5mqxwN6RD<@TpYuia%TtBu9Lq=#@={an6UlvVs2+3yTUSpl%Iq&qF;?_%AKXDY8chs9QuoljHb z5wf~Lce-?;y23eXmKk;?N6H4s8B#|gf|Xbf^hWt|u%518!s>8%KIVXP3blKzrI3Aa zj=i+D(DV76p0c*SZGW9QxV#EKbAf)Ho)a>~q+LAm zuF6=H@=NC(cgCJbKOA2?U1&dSN0p4+*Gwn@(Y`oWMVbd);y(UrUVPhaoNpcN$TGF> z7WiEEEl6A!Igf1<#2wBQ%DCaAbbfS6C{0~C4i9Gd;G1U)KFed@;`qhbaGk99yHf-P zo9?ylF|&a0tD+V(4(&hP;_SXm&U5zf-1i*+L-oVS@7t`npkQ?CoIIu!OEtJFkw&9E z2z4pHXr#B0Ym+7!SKLqO7W1UXbmF*=vz6?S2jIsbt%IsfVWb!-B>*3G*|dB!lD0WJ z{_`+b(um|+dkL_1rD0SEKLq*v42*M_u3x43lk}*cvKtN^fEPq86t-de%H=Uo!cqyYcB(mSpCuxyLQnpF%<`y9?5Mjae4<+X39X~XX9(NZv348TAfv@xoN!X zQk_fj$u6_z>mnK_kxrE7ys2l#*#}QW|42%ApzKvN{XT3>Z7HsWdkXG#_!g8JOESD0 zBWPSZ&NJQgkjatdtNi)yZ|O^zH2WsjV>(e!B@!vBi+zxY2{QTG zeUPNT6Hi`c%W>4InkB3A>p;Hg8u5A;&s*7>QlvYPR4&7<{8M|Z%ef+JW0@fiMhCl- ztvI;wWc8@#MzE?isL!3iNeu1-;^BU}rf+RU(t$|jp>v^GTg`Q?AhFUzo3M`%Nhve& zdTP;0;O*Ox*1>gg^2?{Z?m@=KzfT;eawHpg){^T4CL%&Mhy}$6GwUk1%7my_x4oBL zWw+Q0-5T6CAKN*>U%*QH;BuD+*%?f?Fq{9q|Mp(<4tJck11NF1=bbg$DYJ-b-TrCp z)brYWs(829W7-$lKvm0?=+?i$Sj{)>MKI8GGF+a<-R!XS`;Dk9>2^*r^E)Chjp|cw zgvvwOyJqxT-ERd}5JB;-fN-3uq|CxrzI;iC5VuNqFZWj`X*032));|f6?Xj=fhoY5 z{mJ-ZGAE37%Iz!gv&bm5CG4l8!Fgog-_V{{+qMnK)fHOA+q=1SMA!hkX2^(b8`STy zBVIX_dDf|Nf;=B!AZ;6$jXwPq4L!^-9okZJRhWtW6-XkTX{d*VDqodVnBU z_}9=@@*7OxyZO;(qm3Z4u%XVpg?v?z7$m__4etFv+`c>vaptYae>XesZV5hckZD zCyKO#`dG)9lYCCtWUxBuBA(JXK3@57WaE%^IA=^NwqOja?B7D}|GfVj;Lmg#KtcV% zAIWvP0_NCH;GLlBJT82}1%t8vQYe3);-2DMr{>V59Z71HuISlwM2M3*IOGH0BX-?C zl4)fyg~-s&Q5v&=JL1MZ8}U{&@u$O_9M{X;eBfcF?gC~6p$Gqy-2GJg&ZZ>RfW-G@ z%J6-Bg$e&36j+qYz~u=>utfiFpZpQ6tS^nX6vDfV#7`+Z#WE>KeLp7y6NT=p3`H6Y z4Ao9Uk7upt<0nHkk;{Q5EYBV{s+HR>51`H&DP(ch=FQJyo#&AYlZ8js7vb6S%PvpT z7h8vnr4WC!nafSl)lTbbG3wJ*lEwvAYnWQM61_e}JTsOgkj8T7BX#54jJD?$baOe9eO7P}?GiOb0Q z>=&oT=M!V-jPNJ=f)htAFpp2tQRoJ`2inO605oT3N;*?$Zd~fqCHhg+;su=9d*h%E zzfQM~_gD9qx6jv07{AGWW3xfV)Bbz&d$h*BhD8%=?dr)}E%J7}ugEAv@8%!3;2TM; z%-+GCnFkqr33IP^;B{ZI-=P&ep&Zveob}4I%)`-fWk6>CG$=(pL0xah4rZ$}IQ`)7 ze!O7oFrM_B+*DOG*Rmn>+duAU?ptSsaeOg)G>_)q;1lpC;}{-%4@cdq9p`5t8>Fc_ zv~e*Wm4glS{1n}6^kG^h=M10~;ZVR%9;s$k`tz$Zf-@Cy55&4lvB*?7-@jtD8`*d| zAi}YuR)+v*@m_YCH|HZNMSVBs&U zLmpf%+8L4UozkXt&<3U`m3XzYixL(hyTu^LC5}!QJw?csgv?v^sbedByqK(FHVHBF*qVr^q=x3&A3_TR^zp6nj~v; z2mPI!?{O5tvAf~Rh#;%qmJBx4v2WWJEhMK%8(%ZQeTeAM5r=+*liHEiVcHncgZvZM zd*3}@@SE0@;&;c9F7;o$zqo&$#)v~2hkdnNiMiY2N3fV$v#3%6Z7yHiweeYzTkmjJ z`uG0R;M){Ly>V7wjLQG_Hrgs3RP43yLegXWy(F;leEIxvmQ75s?tTbH+zJJH^QjbmXG3|U zT3uK`4ibrevIp0I)dc+*v{RAj$xmK6G8S2v`iI`uw$EOzJh&?6miSr*-BIyN3taPn zwPTn>gAsr&k;>wlbJVMQMx;tI;aH&}OK}x^j^HCLsXJp|JHa1R#n)kfXYI`h3$V|D zD8M9-%cHFY=7QGrKgcuZ;kidJ9-74pp`Y6=pSh>IRLeq4eA?wVX_&>Dg2!hNgl`X#AN|yt%0j8SczI*%uP(`vn z@hj9}rwGH_`3wLj840gEByVYqd%ZLEnqbU;K1=%_ya1JUU|Aqh>7sh_E_id>d&FLW zRsf&Eif~s>yUku7>X#?z#Cxm+#C;rSXrUf%0A0Hvbzk6S_JAzYBSkN$y(w(Lus|=;MF^r2fKVHRIADf}@aZ+SwdsiyKt z%8woX(sVOx+E=gEy^p>-Csz3pn$g^MNW^2_a2A-IMw&Rjr3}E3qX#|%s1B@PZ-htc zg_*J~)(OWvB{n)HLHcdAzz8;|N;t1pw{Mc{RcrX-81c@dPez=}d{ign)Cz 
zJH|4#PDNM|)!;3^96pfrq$;7{wbx!jT#yp}SgX5e&BV1`G%_}-* z-L))>#XX3&&!_UEj*sIlsHJ!0xmaGm^VO(s=UMpErldJr-Z(NEr`p_U!72?tdrbF@ z;CI1x{o^bLlGdy7)CJ+cKyVPZEbprv$Y=7p&J^c1ZN6m=7T5oNp2?m}8q+$%A9V@H zDaMsNKQwt8;{aF_kMSpQ_V zS5uwlINQbD@LmBA&!zEmSSswGF1rka`N|oN=|*5R%f0!!=iFl(I_x392%9kcy4fsJi?|^K zzYCG9%ksUC**w*8gr)ldL63U*1hxf>{R`IYm>^|YDBmHN*UTh?fs`q5Da2n`VUs;) zFVq7f#`Q?sr8|;MOLt4sJOoBuqGc6*L+S!D$kd>jh1UhpORAkgUzWHjG_vmB)AbBI zpDwr~`}a5ddcx?-0b_l^wMu0QL5;B8FO(h28K+i9PFt{oG~(ZN4tnql66}gNaSbkX zTVXL&JUP@xO)LQgNJWdHy%6D@8pn6*imUDdSG>9Tz6U&Sy<-52 zCFs|e>X;XQw4~O!)Udvw%9^n? zW1+kZ-uXV=DeIHED62Z=^att0zF3h}z>6@7>o{>EkK&`e^daGZ+G{54N{Ha_yW-rN zPObX8d+Xs&Nt&>9{@Mpef zGOHJG+~R)}l9T8Ec`f+GImy1nHfW`r51$A`5lY|q@_@_aU#dg!=6Y^{LB2*8>{_jaR7Uazi{FdiL{(%zkU1NZgURIC^jo7in4xTmE_ZPw z70xex)symtMd(kDeHU#{-H5B}OHupL|Gm9&*E0v#ewxp+Mkl{&q4n$6`ET;B;b*GD z_E+PbrmGD6i3l=_tuOW4tv^1UL(gp&@%*uehtu~(F;G-sj`>py1y&?X`agJYAkN7S zOoN!`yNXO-H0O{%iy)7tA8GlEJV?$h&C#oEZ*4-Geo%f2+jI*n8tJ~YzH%p8t+OFM z;hgA>x3;xHDj?wg%=NJC3)fb;B88W2CgwW1z)JKOyQNisa=hE*W~MI)cernmPtdUg zX9kE4f+B#20O1nid8jwhcEhMyvjtZ;JK6BYy*4fsk6%h>+-h$$a3oWg?=G#xgXoy@ zbS}I{4%!c@4w0v*Ti38z^TJ8FrLF*)>NioRx?BF%ouRyW>%>L7-r_6<^0K02?Z`!I6 z+GNYanMl5LALfO^3GWGwHQwp#rvKTAZ^+$M#42BmT8+!>Bt3x;Q|&|2>KWC%xe3XM z&vWLbQU1pn>62=|<>681nn{7qhMXzS-|H`W#%<-&4QlV^Jk$}{FOa?J=q-78@IoPb@m z+QRh`*0jRJaJy5606zaG;9{Q%bJkXIUX|qtUSAh5J7(o15ROtt2VTvgL|5ZPhv+cc zUq^_0U|wKof^?at6Yt`2+J$<$3bI}7EeUy!!!8Ru?;OfL>@NEb_A8ZF#Y>eAf;KXzZc-?Ne?P%<_&|N z;uqUkWzo46$-0uyNG$c`=^G2+QLA&w$j6G{}SAp)ZJ&Nzn$F zaqfWA-blV5By%07xLdvtmwB!S(hwt{9r+Y0nrrf)`-*R>5we@%J~(X*yD~kj-LaR;7$k3DZSYjO6ogLBwYHtoQ<-yZd`BHoQB^JG{Vka$)lm-U zI#;}>h2RKpLUGCIJ z3==3HtnFNrHmy9#CC-Gz&$J`CC^tFU5>eY(Azx^sna)p4R2H`{zP1p9Fr7j%I?{x=f=PGUKVdTR!>Co5;q9iRZ?=5c4m zyoZt8Mf76BSh?7sx$U8fsk6W<6=7*@jb`x=%-MngIR>ncd$yX^oAD3%8R;`;fFW%Q z@OlGa4s1ATrMUsDUx3zc6RI+yKO3rFT4`K>I#})oy=>y1AqmnYSg)?R{@W_8Ux7KW zLVAAiZ~TrH>Jvl1M0Aohz`9&SpauHyH&7{C+1UhFNQgT@N$4w<-{ya&L<%EZD_v60 zR}ReqI|2{0@yZKC?V+Pa>9hZG^y7pwcAz~ouEKI&B%ZTM2SxR35x1{v^R| z#-9~oeqlV|ZhDThzkN=O&op_*5m?2tWf6K^kCb9*G_(6M0)MgJWy?TwAWYu;6j-^f zWNO9A%G-bUYbRagZ#X=ZiPF9Y|C)?7`S1x|0|J|Pc7tAVYV!i1V@KSf@~F5`+0D^H zA0mwm*59Z?T(Ry!J;`xKf0z}3c(T~Y>aTG1c> zgaz874}Sj;<|B?ppjL-LE~{htGU=NzXG1e=*$y#dyLN3}nYY2`Ud_5+Pae3mPPSct z+&mxlZ8H5-0Ka=XsRntA%#;PY%5v&=PKg43SV(3Ts>|5naW`C`>p|Cy_64CvrI7Me zluZ5eOv4`{@Av_C1FTcBIH|k5enl4bm)RO8+v9KMu@+uF zfxuf0Gix)=#=13*NW#{cadGw5-#7Q%Hx+`?Y;i4b(JUw48|v@^XTRU$jTjLV^jLh4 z+d*0P=?+Z*q-$Ux6P@dWw(wRmo3m@?IYCRq9(zxZqNL1i4EpxC$*Umzmg}3=09~WP zFD6rWi5~0qs!#M;hvdeQw}td{y*MQ^1396n#uO$LWUqjitmG&QlnaCsB5OK9C#M`2 z-PBb=K@g68k*;W^6^dv(LA@cMC5QuM*vFaJDTv%p;h5d*&E)X=g7w&)8x85qH?dsy z*?_=nHTtF)oawLFguSSFNxqHOoJKI`^bkKVwSwM5i&Ss6oYJXb?^nb%J{*`ISZ3&J z`v0H@WPX^bqD8S+#oKYzV(4EO-)h1H>6?5gu4{t?l-#;gZEbVTYr~5ULTuZD34mbn z#gg5~#!|ZFcA|yxbQ+p^>STn;8j^3!qfLvMT@Rgr#VBsRm@2ZELuuTD+`l5t?D`cA z+bm;ij6UMv%Mri+Kf0xWINn7*W)YmjV_-y7`3C_8wF34hz4FegfcG_i|{$g3n$M&l9kZY0g zNxqlmiv=oNnKJ|dhOoy%UpMHK1s6xVP1Xa)Ect1Ea)B-4emhn15_cX92`_(~bS-Z^ zpJqXIo4lC~ek^znf1bZnvzw7!8ovsabEdhMHDV4wg&gH6-Y!QgW*oB@@|uB;m0zs4 zw3}lh`D(6~7coA!c97Qr-af$vi3Z_Mii^5hmF~=~Ktm+n?8q7pMO`We|4&+l8x*Ve zw#=6z_OVh+C!3wu8P8ea>azO{{LK=nRCi=jfM;F6jnJTGAI6RB@T$ef9CvC9y>iIR zUek7ZnedK`**fV>>CU}dDMI2iEXpKl1EWR^uPQw4mLr#R>AxiD>lm|+*RY^Qq4y2o@prJsqr>?ZM_pbKl<5Z*Ih9Q=8hPL_hmcfoVA5;8nA#w2wj zgA6+oEI@fs!k|yWsftUA$=0oSQMa|nf35{7Wy*Txjisr~T*Zj+-S0_xe#9_eV&`FW$3|$-`_}2N8?Bp~y5FSPzyMx#5hWFY2U9Yq*|QGIC) zU&*C61oROeN-6sr#<>#{baeaRaxVhKIBdlc#==Dv^d7%yiQ0%6dvp~!F|4mkX#J=N z+9RT+oKyTI6XwSyh3M*i;lhnOQQ#9X?+;%+X|CB6e__LE9xwl_=#rI^P+=5`5XO}w 
z%O@KnN#+~vyoq_(dimsXRzxkRkaky?`S$}&ar_^+|9?05Ll*yvRdZ^ED~X(tWDKVS z>G!1Kv+#iD7~dT;N^8PUqLILr``_;w^tWm!XBtqVl}0D|^8_?Oy=tL@B72^W`IMa^ zvP&i0>enhyzqmeC{=%;kkbPloEYO($Ti!zDqwY^MDC#5C<8t@2NcASJ;>SQhyC^m$ zaN)TiQ0JTy>x00F+1G%GgDLxW`7irZE;x+g>!a4Aze)}NkFm21j-zQ3v~4lVVrFJ( z#LUdh%*+-`7Be%$h?$w0nJtzqW=5aUSPAKqBLT)#J7l`kqiDEcfl$n22p2agVLW0#E0q15f=Uo?CWmUG|e z@J@BKL2kJ0JBhmk%HFGgehh~D*n^TB0eIv$=jSL>lzFP|O>IqF%tXph6qRt+%&zA_3YfQw zkHU=FK04?IoaUD{F`wF20>J?8Q1X2!e{cb6#LL(m8%lVwNC853@-;etD=>MmNYJ7$ zKfb1SDA8GF<3a6_ahb=+imyuNK)qm8E>gO>lc;*YV=#=R%mJqTVNw=n7YiLLoS)yX zZVKKOKSxzX_By{fJ26*j8ykA zjblNR{hlyP>aocJ+(-P=oY-^Z1nBu9Fuk`yu~1wN-na~b^ufZjL;wfZuk-oIwkn@! zkhIR5i`IBI5{T1K5(1ima&=|k7RutQ?_1uuw{Jn8IW^)!Af z`BwBL!Y{y&tjq~9)*@PEFe={=Rkcv8PRXMyUu>8<~9+V^RMWrTE#a!cg-D72l{L+y6wO}xAzztvgHSU&hpib}rM zMUgZ~k~G&$KH%!Hko$9s?^ptx--G28c8vAU-*Xpn=kYmCHeonZuk1G(y5+Ek#qL~0 zb|P9Tr5tq#gxF2On-f}Ct2(n~#QSM;s!+zB$|M`fKScLSd5DgqLe*DsG)9c2_PcL_ zMNJRGLNQGTMBuIlzs12wl8XlK_^`OA2pht+943paY>LGFG%f4OG?N07m&x05w&&I9 z+zRJq!3@9+-`ge8MY#N+`?16!#`((O$^_@vE13Fwd^9J=AuE) z)49lwNPVd1CcQj>z;i>rfmGUuoA^xCCD+^CP&v-*pFWqU+3G6YsJz}_g|(=QWb@)G zeW*Ih<6)Rk;MBN)QdPKCyC`FpxZK1U1(W`gvdqdMrla005r0u3W}eUiRLQS(sf_Ox zXx?WHnl4=zjYnxS(V&~J(@I`kR#N@lQC3lQwVF3Z$U@#C-B<<1m!0ks$<=M)exBM^ zbpL!W)45`>zr4)4!A8beYMXJ_c(=HnZZo%)wy+=i4Q1ymxHZ7$7ov!5*5HH<4# zO2iUQ50;%BVD^-ZEdmMyN3DT8M3dPQqO$YOP2%w>AtYe#L}7_S?nDyViNx`dm*R>B zq#9Uqb;^lAaa@a=3Fa;kSFEkCz~^keBi;u)Q5W~t1*Tk|HDE7*6Cr!e{R&p!a{m05 zZrswcg9K}XxiXtLUqCUDd%`q(Q@}ajaKR(dO z&SdOg9m#B&X9x}f8HVV~%oC(#HYcn_O{JE118p+0TV zBEa3d^?;XZTh?3}sI0$Lx{RJyYOQEWwNe0(*D91C#-CpN z;HUCmt2)pejL;nalKmf)>i z_UQmED=RZJE#Q;2%*60%n~{Nm(Yp=p^uZwbAGvw^jRk$|n4 zwaMrGpcgW*7dNpmGyl~6#9GrUeTJ7RH2bIEZ=ig_k3U25AL{OZ1tkB2mHu}~|Bry= ze-WVnD~8uwY1}q|0kPv2mA5Da(%RabBv84qY#eoiR}p9!OBnLzJ!va0M7)R; zb3IjIrgmzt$BovMrT+BWL#J;beF+P~$k!Q59WG=ET$^G3 zgZDE*VAtz5dn@ufcWPH~JAs!AxV9C7^=&IX4_!?E6&&tLn210&03=H8P8*J11%gI+ zwTXduQ?|Hr`HhKJ7KGtvY69j4Hb#hKK>68Mf0`R=v(*GW_b*FtVPe2CZ2#<!<Zxkd&HH=_7wP}(TEFuKSp#$C741mKgZX7B6C#dDM5 zG{#5E9!X2UBZ-7Y%epRK>|sArF1uciF^^!yV9|AAz7%RK9Za#I#r z;`1uISbBVjzmP3%w%$0d02PPCB|scCi6E$}79bXI)g~zMi~Pn(Q4T}r=kJeAkQ>b7 zXYP#i9al`7-&q?&J+n9XrkA`hzD-pL45ubCHg*=$W#cL|4tnzZgke@ z471?Q>PtLU!1XG0#*TNq)VG?bOfZB@Lf$#}7e7co1G$_4a5qG429#uEa$|q&U;%t0 z!)z~gNmUUuVYwbsdwg!Z4H4TI+kwN!o^pF|t*bEcT;jXow|;%@OvEm~@S}cxe=z-E zI)>C116Z7)9Q<0i$9|BjZ>^*r;8rfZ=jytY&H=WwDfx$as5GdTNzo--`j}^=j?UK3 zOh5r}nxf~~{#iLfY^rUjl}MX0au-88$UYW(hzb(k9Tiv}NIAhkb@Rd*A&>7?Cm{Vh z9;r|f2E-k*nZ~)!!kn1zN_Xrdv5>7RMWgoqC<#7yl(R_ZSnr5cej8U<4=L{pVQQhL zh15hQvts%5LP%#sSnk;!B&ZTU*nXjT)4b5A*YKRj62I4Gm+wqpA>sJ>_CGq?I@8?+ z6RO|pNl-z>TJJv@oD?`6;q2&`M|4&WENy~X<${YDBb)_!k7W4vC_MSMh%>cDYRe?Y zU6uw#7uZ3S2cGwN5`E7R^$8(>ad#%>2IJbXyCUllA))Dxg(XoMQLgjn7L1<5dZB%n zU=H&|n+}6hblo@K*KY8=6T9-qrwdtSKzDxywGKGH`oRvm)sJAGz9K=+6vi!TIihew zt6~9;%U?Wua7XzPIisge)hC%A+qqF}p0*GraXwSDRI9sWC zv!)P>-V9c}M(KdOG(5)d8NR|yz^Cbk!4#?L)3hak83ozLyivPhea7;HdLs{T&Eu(E zDs|?(Id=8n^ZsJx3*!qvhj2W6hy7AO0EHn;Uj zII$9m4i#J+t9S-5r7*#yr5x) zi{?Fyj%b{BuBIo7%sz#y1cK3C*OXYtxs*F%;*lg{?$#_Cg2%WdV^sG{?g*Y9UU{(f zY`OMXLT6Z9xx;y|JGwSp--j8HI|d%X)7k?(LXv%$^FZD{iXG@3kF)-$J{$vHN7u9( z1ve=-HFi2L(uKVe_W|Z>GKZ$8*cRPibX~MMbUGn?F;euBE_3U1?#CU{Cf%7t=p#z| za5rY*70&YDR1t}fiTKj<;?NdBv@-M4n1ZgIHdFMC;JQNA(POQ5$QDE&=MIl`=Wv_m6*)7j(EPjvSbNYZF6+0yI>NPnWsUQgOZz3s8Q*$^=K>=% z;zbd~D=YH3Ut*s`A7-1?0Y@9G`<0ON>U(A=HP!>`q?9N!T>r#I*Nhy9t4s84))ZRohGyCM<+)) z1#Du)eb`SI8guEHq*e~13%l5bS6CmvK=5Np*%9d9-+exo$eG|zwykzaHGq(ZLY>S) zZE9;2HYvKXn~Cd$&4>L|iAU4&=`3fx>TcLw!8xH&!$T02gpudxfj0P1%X82M_;&<8 zsliUH==W|v3dq^?DVvEORrkC$L!9pK 
z!6^;pE!l!?>37=zkvqSi&esADN*YgUWbk`aM$Reh)T|a@ct-7MPRTL8?$rdtGsIf= ze!ik1Ez$6V_7(CmL*jBK$K03js)jZzfm6&Ae#w+K$34bB7bb^!wT}uje9_eIjNd_0 zjH(px@nPmi!u&EA*Siw@BaC1v_JRQcXHJO{jrZXz;V_4XYB|VmDp+oSKY+# zKJcDusrx!lm!{BCGFKGN1wlrWyAV2LsJjImd<*B-$>V8zfmrVrb&cE6hnSqC%@ys0t0k^Ju#sW`0xX}5 zfz^)|6SI%k#ZYgLt9Q3Hivin*3bIQD054|j7`0YlS=$MaGP_(B*LaZEzmD-kLM;39 zu$#yz8lGwL=ev6Bs6;`@&aGPu8DDpi3N;y#Bm$0-wUHX$XJ&d2QQC9FbW;QZfrnoH z{qxlDDX@_b{aCoGcum}@A@Ru$0b--;FWOlhsCWKv;aHl3*$aa?BydJ@zXx-eyL3?n zB{tzF%;m}$P1ru9YTA)KWlpq(es$lvd7Zg$5$1ITh(^KQX=rlyIAYFZA^ypcQQ;i(bl%HMz@QB>Ee-A8+?nfGqnbt#pn zrgtb~^Neop?)O=VQd@Bz>fu~ED!k<{X`sQZ=rU$J;Bs1C8=mKz9hCT?(bFM-jiIa2 z;wfQg{cxesN1)x5S)RX-D84TXM=-Z>o{ry7zbQmr36lTWJa%nj=*o_xP zHeBzF(cp1glaSU5$~*^?rM*}k+0v;vvIsDR#rA^oouQJ?eRY*2s#BC2zY~rf2>p^yBntT?U5Ap_ zcT0k;%pBagJ=vT?XG3vr$EUJZorf~UV9gh!cMcs8ol6YU4T*(u2qL)QuH)~7hjn66!(-Ui6l`b;yetOv#FSQ^x&@L_$dWXlgst#txW`(o)TUJ9G846 zhd&_+Zv)+(i)mL+PL#DGy+?BnL@WO``EIt8gIO#?oZOPy{)u%3WziVo?XaPqHLo4p zYTk7oVU7yMD#}R`VJZD3*a)4CthQVpr%2GV88BKXpSK*Foux!ZFaGy%__{a(`_fDIMWhiq9ZwA5{v}C@|qtPS99N)Qm48A1{|6uP>4zUzIfd z1{C@N9aq3RK7hIDW$bCvZ3L(Oqx(e(Zb0E%bbP#PP*^WX3Jo;}E~hgrEoql3Thz&C z-c%iNF9_2u7Fv6wvBmzHE_0zf_H%z8^xTB(aM(*BF!Vy}prKi^E%|N|FA(1A4=Z<` z_mkIkxySx=$#F?^ahqGa#_qW)dBXEB{wjC2x(RY`&!s_+Sv*?(2;~q}=QLiAkRQdq zKWx67fp=y~_t6I(}*ZY_I?Z)m*Uzpzg zxwLTT=!a4RE=6lisvI( zV=VPVKO_B@HUlr{1Z(Z|UyVeuXV=c4l=etbMTq26Tog`KaC z73Ib1dgW45ueAy`Bh|dYhFN+VIw$tYh+?mAlFHoege;oPiwb>fH7q}oH%#fj%RC^B z?(?5ej2`BM(=weiIss5Ih4G``awI-OZ{dlPowK8436NXJ!s4cwhT zn9-dt^E$pLvc(Y{;t`ziH6=!2xGp|X`bs{Z+*jw|wx6Ha7K57>z!s208_MC3!c|Ka0+ODRW8$J=2VwhrB-iBvTsX*fq{7*;hl-$S2zoYI|^t0u`Ks~eXSS(ui8LyK6{ zl*(Rw_dh}cWj?3LSE*4{01;_k(m*gYrXLj%i;q9&5O3ZV7kNgDOY3;3AQw}RMn>?sZ$xbbyl*ArgP^rsS2l#By*W~ zPIiwqv-Te|Kh7X@#&q3a^w7h0UDgsn*cYTp=H!>+mr2#<1Hc5ZjN{R@)rydGcuGW7 z1S&~Ens~$@OGyQIh;?ZAX@_q@Pvc1(K~0NrG4%=2e3KtT=Tk7zW$7~5H;nS#%Wj-a zT)cf;F#T&ou8ZRDaj3!)Cl88*UWpcj5XZiR;L7`lj&F30AW|_BIx1qX73|7AjFwwX ziA;6eOY8&YPV`Ry%wDZz3cQocU~`(Mvy<2y^)$jY3(IysPLD_^3ZTWkj<$3j%F%e( zKg4@pq(&szh~G)g25{dBlA$1-Mhhxgda$>%dWd$TfxP0rB5$Q^stshE0km(=1z8*F zoc-vM9b0}mcCs%q&oWY4m`?d;qAn$El2^)k<~&qOY30;c-OmTLzqikE1CTGJJLVe+ z#I%2QsQ;X2DLj{iJt(KiByBK(rMBYArL}1f*^Ejkk!^18+_9j%RXkcliH|4Za#|HH zNv&j1n=Wi^ytCanX-GDK9m&N!JBLl)O)0!1KACtU20;?{?;iOfoSNn`aBj6c@}MMx zp_iklu_T{n_ahNSS@m#P?KfaajmiGw_IpWH;K|sVSA3QZ&V|0THk(gGUL;O(;)*dw zAeq73Yg?^e$Lqs2rAqywDAuO$d9;}686Jq z$FVP6^*sB}Ja*h@GfrA-4lI_(;!z%ms2Yp5-Jgr~Av}2%}jS;}*TQC#OwR>PxnE2n7c`P8G)>8d0Ya+fjg&Q^Pr$%aZ zih;9-&F7#dxJV&m#sFT~Ot`1E2WZ%gbi_Ck1tR(>Hl;^|FjoU_nZj-x<5wE1WT5kQJ|`q{z>WF!61-soL=HhN}^4MZ2AJ3UKSoM`AHUU{PHa)RylZ0M$ zr&4*Gkw#<;!HNyt2k5aZG2ml)jch0-Ep7&XbPhefA)&i97>-H4E)u7}*l&y1oU?jfWc2)9PG}QGaH)*MTcvV%P_mtTzvF1MOt6u;$wZLPeX>g>B@R{WCecMbNlh zn3ft>$}nJ|pM?Cw{0TZS+zoX5r!t7E=jedI#nb9|pE#Sj?Ar*7+#QN~iXq_`DYd+; z-H?8dKU|9IINKy-7t(OnwSXYY0aa+1*?w>fRkVr!oUR%TbDrJsaY5>$XYErhD8h{#4GH zgbsrS0>^t;Gi(|0rzuvPE)dY{V)Vw-QQHwwmtX)=8SZ3fy3 z`}iNlrrg&ar}%NMOdn=#7Hf(VRe;bH4no4q^fM)vV6@sbLMhdjc@geZuI>rIl5CUC0UmX zcnWVJC;2cQx97Wceb{L9+9Zjt$E#=U1pnuBD2j`m-9}mzPxv$0*TVwr<4H;?i2|d^ z=#+C9isZC}a=fy@gkZO%QvHO!;(ph+^U9xTsqbH{@>IlxM3g}S)1oPmR*0qM@vgtp zoc}1hQ?@Y5Y1uqf%?c9gUj7P#iF=>auuqj z{}2fmyV;~j{3M1kpNih3i->feLeD9#w3G5-ilfM6P$milF2w<68vC~SLK8x0a3=X` zmb5QAjES?XiScAKJux=h@_E-OF7x7AbF^1#m&N9|p4YwC!6qK_b8+yRLWC|6^47b5 zB=*K(#4?wz=jrh!@gdRSOW>nEpTd5jsy2A`Oue$g9B7%PM*c0wPsnd0@qtLWsnQf* z<0%Qc2Z0%MvjFE)eyb#vRMRZ8LTQ|({BtVJQVudimx+{ZB8T}wLEd-g!kueJmoU8% z|5P&q`!vKq>DU#jo{Rxo^@3 zbxoBNILRwOSQCe`lvZwoSZ9k|W;sqOiDShiW*JV(uv?M^S>-RtsJ#5sUEtCF^25$} zrwam+VfLLXbiAqaQB{c!kBuIjP05XRl%q)xnY6>M-2S(xl*bye+ejOH@&+amC0e~C 
zl0J9#8M1h-8y7_h_O05052xXRW+^wXN;2U!o#;^z(E#tkLRZR!#57 zAen5ltBRcAt$t4F^jX^69kB+S$Q3mVD!;|6Bs{dfRFZ|46_L)N%vKdy5w2;%mVh0N z$RQ1vD>bA!CqDt(6xQ=DW@XFGSRS#ibzPz!$~^czbMU?+l;fKZAFJ)6pDd@l6md$u z0(7W8G<3=~q>|!(Upt4hD`WvDbJ-#!<<`{BN*@+QQ$#j{K)kriS1Wum8B&eRc zqA_HEn%XOf2+>2ERKSua4qs7IO9*tncX}lIWs9PB!Yv^o%9LrMI1QI*%!h-93xp3s zqQxmj1gi!cN;ZGjqOG+xdcF2EC#O?YQyv-^1sI#FQz<%|RE4EJOb#lnn8BygHYB^w zWl@N6nu6j21Qa;2b}BHaGcX^N%ySRcrT&=E-@l&0Ey67p`c&gaK}hqf!DHJU&Vy=s z*0LC=!-omkZ>HvpHK&41NSEw?z8Zdx)tej$Mr2ijq{AK`(arxJ#EoAX8sNQ7}z$bju}!YJ@Cjn=wo_6K+IjU-^n*WFgP2DFZ<3`JzgE4DoeiCF-#nd3(Jv*(6WjW2FN84PKw_G%j zY6~8NiKGHe2Po1&rU?p1y+G_B;?KzYC)Hc)TPT=~#$lWgBZr~pK}l7%lTYS5ip71mkjH3|s9(j&h8k4erj&_0%$bHK)*c28%9;;N&lNT_q_~0^l<9@O}Jjp-}#k0$L;#YD5Y(6Gy^$X(8ZZ;lt zIQ?ksDhF0oJnX8qS0VJr@!&2p> zVZpuBO6h?AWK=oba#{l>@O&QihEcDL)?`W#D@EECSH0(h)oM6jPI3Pv64KN(v#4 z5iZInWIsL>XU5|3{f)B+QXf9GO{4FXT52{rXS-}$ZC@%(CK*C{;%uy8!z$hcn5XzE zeW9oBb#b>KSm3Kz#*N4;i+_Dh@5-eJoVJAM87aL=^tiCn#+cWkp zUDLDqg^zp4#I=Lw+| z5-5diTeo5Ds|J`>Gi0}P4k-G&U#xgp^DBZhUPx2D>8SOKprhz z5Kc6VHXLoSQa1~yp?YNDlTuyp6wE0p=fpjYIY`z`L zypN*p$@fFQ5NJY#lA&!Z;C-*kBo+WMK-my3@li*iA8^Zk+(;P*%%83w`Cmv zYlxKJu5UpR-InHxbQ&H$7H0(`k;hoeqF4RV|bn`MFZ+8}%$3RYxlze@Q zmB*~I6PzcrYoANZ`ir;CCZloJWo(=l|E2P5OcdT08ebA$7Khv7b1~Iz%;3@8n^DW! zpfWle3?LSNv{WY!9|G>>d1)XX)))aP5_jdqxHiDUzkUGAKG_@xisqbp!?c&#D(tt3 zIL)ZtOPC!axyur#RsPT|Eq-A%iG}DG$>V8^OsNlwUiH%48L9mQyjsnI}`ogGrYS@1Y;o-6uE9vS>iX3Z{;meiy~{w;{+9J)HF{ z756R*o*XKH#cB=OElC`5jk+#he##z(%~Bqllxsqr;AXtB0Oq)<0lS}W(g25ThUFK- zsKm%x2LXc~s#M zH#Fv;>oR#ssuGt{rN5;fd_x8ll}iJ{c0rmmAyysnTa@vB7Es7z5p4JMCIt=gQ0FD< zR%V#oH3&BpKqfuRFcRzv3(vAf7+0~&y(x23g&Wc`o~~&dWM7rqqeS^+mUA&cq-S}C zVp6PEQjr8cC6cwPu=i8Rsi^+;JjJdYdx@|agF#c;VD+3(UHfS|C(m3&-I_NLqehUE zYx13BqK;(YmwmW1F>R-hnq+oD?J!LvUAFDU5AVYwRHn`8RhkGOvRsQH5Evg zDXJC-n$kmEnPDpxOrJdR-qf&i!!H{V1XedEHjL9ojoAZlu?{q*$zTM~oY+*4Eb`Fw zPgq_YlA9lO8j0T;XZviJ+OQgDiQXl@()=Vt%FRpAF<#O(tuGo2M`9?YOOQaFM2px- z)F<@^qxeqi4@^T{B*{P#VG0avCvPd)ouKExEJ#K<(d6GW+bl+ButO|J{<6LX959O- z57=6+&YQcmF~54nqUyYcn@Ck&iNfFXcsryB!^LAn2kI%gx26tQTU_`nV?U6;mYL?q zhvvtRhy+K*aLYpTlA&oUjPOa6kDQE@$Hfrg@6k9zZe@u*V!g+09$G$ULB|Hyctg#J z_)ZX#Sp^Izq_;?D7F#RkF#ieYC?5Hdsum9s7*dEj)_(%>8*X1h;68zbG{u-2)wsnm zq()t{qRe1UB#)Pt0L`3daJ-Sa;dxilCapzOi@J;4CtrgrJgeb#P-V^MlL08N%v?5R zD#9>os=y%OM@;=Qyh`6MA1?a)sx`dde&5ng%??5Sr8)RQ^Ymr>{Q$7$SIRk~9pJf= z9Ww_aJONV4LJt7&m>7yTHvri&BHl5H(B}b{8y1wGX<>A{K9e@*u37&{tu3o=>?#=9 z)g|Q;*a-iG`orR>O+`lsZhk#(Mfzak593@~B+)7fzo;;$fwhXUoiTz|a63gSyO$U*BBMb}{o~01u$)#ox9?@U<`Bv*An6bQ&n@S`3To=xHUm5R0 zj6j!3ggE8!BXBJ^ThuvWFINIZ@fGP2a>2CPdIsRvp~`43z5>Wo+pwb(sKvoBrGr_# z1ju0mu(FGJrupWt5G6CQmsh-hy=L{uQpsM3Qz%e4V43;{?Fvd6-`PxJ)kimhIraro z78)zZ3(FhhBJY0@l`nwNAgXeZCS6Ej(bu}YeMUF1InWf4%eO8=4u|#|t0bW^mLMW+ zWY!YP5ghHqq?PKFnOE-1+ND`UQhg8pV~qwgONvXOY`_!HIv+?>n6-tYhxOe$ficIf zd*~wBYqWb0dduDe%9V%%@pqAKC~n7u6Ur&DnL9R;Sn0sRxQaU_)tn#|Th)0&%&OqD zi8FkX#k;;;)k=n+bpRyn*_>faxL;BF-F?D(-LDJtL zx;HuD4O%g63oir=4hBiu5vIW)b>-r1Sn!7-nrqhcjn@(RnM%%)nxR%Lbzp9&@@WD} zH>HTA%PYOqMb;eoDgJ{Tt|f$CtDW!z=+?j)SN-Mf#--ljR#wN>AI&&}N>c~MM~|!$AYzR}Qwwna*0dwdLyX?-Rk~A}b_lRA<>*e5HFe{5|FBnyE78p~tEjnPeeBSDg~+QKmU^Mv_E z)&9cO%!ET)15>wu6y#W;e*Lg?*NJ42pd)7uvigtsZAVZ4iH9T;=EBtX^;#jZy_KZr z;mw&YTJ_$E_=s$M^f2pGvRrq-ritob`Jn*(r~f zc5L(0YrJp#gZVmfYlT0wv*%{bsOY!a>=$&rmFm7x26(uHb?njH{7!Vm)J@!AbmDe2 z!H^Wflng-lC`G_Zli#VZmy=9A%RK8^^sKi7(=l*g{3v_$el@s+dJa4m-VFl+91V*H zWF=BK=$%qLsc8~uCC$vleuz~wD5a;>@V5aB;_L#PIz5}6inSWA6(r%;TNn?Z*VnW( z!s%%zEc}VBH-v;|>|wLu0Y|B*&Jz&lmxHxy0pl&9G2iflYLt-zzfr@e2Y}2B+>=ua zq)hYGtp~M3CcU5B$}Jw&PyQJ*daj+P1u|~v;C2Tga)q5!Fb-iqA`ZC5*;yMwNO!qh 
zuz--0d{_G@Ws?)`BD{D>>CW9kgTrjI-s@UalcsgsF;TuFQV?HyS+RIfGlajH0;Ajy z+t826o*kbgMJPqkN?B(UKW*$h(Y31FGIosHyi;t|#*S_3ZByDj-m302<*2}|%~bHI z(^NuIjCW2g=9VTvxy43(LGDL^X!f^M8cDt zk3kx403IH2W60bs6X~JEzKCoeUTJdLjhn%2$Y5icFjtUv?55!P{nG-FR zkXf%`SR?vi(rTF{&?%SF@XslVVjZ zjZ*onXw=@P{95vK1LJ*F;rwybIALp5cHTQ=d5^eorOl#S! zB$sU!K)1!8_g0Q(uzE@=EDVIDb9<`A9p@Fb>x5@taqa32#}OCB$|6|4VscYc0wk`PO&k6i@aLJyz8?YfkV#gK>nTha9W4V zH0Kp@*azOP1F4WD#Ca9YUEP(m34y~f&)qa&XX1e@Ba4R^atn+Zz)*VKkoKm~^91Lm0m-YU} zmf@_xGx$S>N}^kRS8nzYNU0Mt8hRSvx3LVoX_+V-xk=0g6UFtB>xMe`4YI~C9Q67W z9v!9e81Rp(}-F6X4@ zoK*KMQnl$+(+Y-fr7DaX($knG3`ZDs@wAaR&dvC?vTqY0);FD8o88KoYuJmPVWr^~ zehSE(PNFItZ3a(u>}13X4Q$xo7z=>_&Noc%alvJFv&7SxR&Q<^&Asu}J_9gZj{8@X zsH=pj?>6<~_KI?ld?k?{=c`Fh?j(4VR98aC;eV*Psm&zaNX|n`ReFSe2)&5&7<16! zrgYb>EoW@z`X>A_^jIX?FxDiQW`lyK3o=D8ijz{-CJpbVa@S;%X#(5HFD!U*-V=aL9wRR%o};SD5PS)3p<^I$8|wyCv8ix7S~Z_-M&~ z9xUtq*$!CN>&`T#A-&lh`MCVReI9>13KnCtR&V#{2VdGEjk6j=UD4wL_VjzK%Z7C# zmjY-K)uoX7M|pyHmw3Z?5`1MeatY>v?*A zL#4mX&xxlKPn$}OG_8NqEGOsvHqdSglLuckHmbW&Am)5%Hg)LsYWwBS@E%q;1+sHx zhIr+!WM$(tiql$tn0JgO?m@D4);0%@oabcKWz38`ZxCN%`U#zNdG;+fZ*jQp3gWE& z+#|{n#v&>v#!p1BVkbP?Y&wp!ImOot_}2P`|7hg3OJG`u@cW`@usAX>gCwtqV~%x{ zb&}H3(w}rg@Um8zwqb&)N07GBxhT7sf@14iQlqx~xn&-w464c>?L#bb-1E#+uPOH# zO3w4xNp0sdzQWrP&EExFW3CA(p^SN9>^W6;_gvt*RdS~aZBpTe6(zx4EZKzKR^bp) zSgj(|CxKp&j%lK@ZYXs7@O8-&&M{UVsw$B!cky%qFau|(%Xc&%wKmOBalS_QFbdh{ zE5oXHCY`dbBZNxP!#hxw&Q;EpBknaxWi4Ym3$4p->rH%5B1e(;&cT^xxTd&v-ZLG2 zt-MCfBFAWR08dl735=A^p4J{0v!NWoV)Sa~zT>UQ(pRX1-V01gQBzr0F)4@G^LjKh zsyZ@`h1}(IqtTHONXj%!k5g=#dTloH$&vQb#XjJ3C7!HD&EiAkIpHa{V~knpW1gy9 zbD0ILr@t)^26X#<;+=h4hSv$%#Y53{PlacldGdt}oY}M1?VkEV8;Djy-x4sF#Ucxh zrUBX0lX(<&5~sZz=ulREMlzGaSDsw^#%-x(tAULbm^H=F>F?9J1BM2`k8P1~6%EmM zx2Gt*iX=|pDGn3mR=$G(nA_$r3Oci(ZC?Dog%uDI4dspu? z-x;2o2}}Dd3lCZpWB*DVTNU|kF_SAf3BZG2ZE~tEQ2R}g*QsWVKYrkFGGoBFrc)8OZq zjiNq|k$4=946*Stx@8#dC+!GW;sPJr2CbA}KS@#w?3Q;BXnj5|NYC3`<$i?B#Ve9& zFFj7ZZkG0HB;vh?(b6UGq5LIyhU9S zUZma;xdFAUaawu}sUm(dGhI%)tIB+qwgV$^)+}MWV1rh#nTz@bC3d!ilp!Sh6t#|L z%0R-TilypMTQJEg`&4&XU|Avsl&+n6o60%t$Zqo=h{om%fcO zt%) zFV`A+t1@r3U;#F#SnX}KG8-kvAh2X=Hkk$E;P(T8S6oqhJf5ZjS+QkA#ibfvWn<7@>IxT4X zTM>?nf(gY?-VL>l)q!qQjqR;(nr`xJP#B~wV$HKDm;`++{<8vr{wLZ~B!QZ0Eao?xA zCFvq6NFs<%W4y$LltaCQnTRd>*MV}z=zml^nu0E0=3zGUMsc&`UkXOecDoHeo(>D& z=tDXfYsRPQ&8FE_dJ{+QLBh^o>g@`xfP8l~{j2q6X6wD;c5mv`#>8RC7%PyCeb@{# z9bYp;9Z6BCfbR_9I3d*6t1O{-f>glJ3<9KjA3{I5@B^r! 
z-@$_k+IF}g5V30`XDe`I`9dhiGVGz2z(Nl~8ABm}J68m}T7)S*i!`{Wn&Bpa3~z`< zd_;DECK$S5)70v^va7HCJ4{}iSpAPu{RIjKbn*;v48Q!2Z7crpE|~&X)&Icl!p(ZX zHuu%NZr_55hY;dgHY|_#=hXjj)6>95uVuI~gD;{Nf)%9^(Ej;maEZR_d5qBw({E~> z629PH6ZQo&wK!4t8;kh5{yR^k56C6kLqV_J8~xTfRdJB-p0Yw!+Obw|zHDfPL;H+Ce+eBT)H#@}F4LL4gKX6@pN zBJ{S`oI#Wci}W`jc0YlKy>50m%W@eZTH;M8C=zyX{*{ zc%qyhkNde4Iq7d1?Lo8cye_Jv?c5Ix5sH3)@`Ap((-?>dO;N|*-F7tfOKJQL+Q#?- zGAA5}e@$kHnZPEDnjd)3`@%9&?cgqwW>7b}A21r+kTZUQxb&co|D>&MIFWXG^-hh* zD_qrqS;ZV1;g0EBWy&=ETdKpFOiS8yH*x%qMf*a}=-rrUF_4#Fv6o;8T$g@KHM7n= zy#Jo~TPRkvh2jpyin~kkVx_dWJH=f>ph$6dihBiz;t-0v zyQOH5Knei@1jy3w{onuo?(RLi_w4R@XL6F~OlCeac}~tGGkHBh*ms{na={0-r3ed@ zj}_&mvShe8Cmm+iAwA)s$D=OfTY@`Cb`3d{~V=iyIw zJ~c*9Y;KxO?a_PJ*~Re-0x!YkC@on+wH$M_2v7!rV#!hgaw+$17sKZ$*<8yT-9AZ$ z2ern)p)F#LqqmzIJ@>Wx3-5i-gDLIGFn&H9uN@uIa>7)e-2j9sXRypQ=1I?Kn~rK| zk{g;g(}E)5_RS~rU8dtjg#_-D*1dX?3U_~J&FnGW&e->_4}1NavFgZ9i$a1KrRDvh z@hxdp`06t7i^txATh&<|N=H`mL% zeyHpqYw?5wm2$|^3fUw$SJ)pVH%9~Wl)7b!4EbmLb+mN;&6!N!4}c}q*t*h zdT&;Oc%GC9F5(Ol)=y_1krla#uc8+suWKH0%MW0^VbMZLUdENFmY8FVn1r-o=chGw zs|f2>(lv_Sh*JS_;X$*KehQD%HYa~uoN`{a5+GX&W?+~tT`7Ld;h^%<%^sOaCdLD0w`WlH;q(8udqWf^*Ty9O; z;wX()YG^O^9hRx$;0+(4BYHEc)^ros33gXYM8FvzyjhUl&0%u9$|tbIX3|7OXUs9>PA`Dq)@ksL8=;oTphHBDUekSV&|YT8A- z$1XPRbwWE&FQ)WFdlw5ZQZQ-!rA4Z@&3?(&p@D0{+_!YeSrz!8ijq-hQ83k?yl*ID zeHK3GCp=zx4A!W+u5<1wBRwKK3Si7kE!%Nw<}H z)lhDJacdZ$6J@4^5R6w-eA^>yRHEq$JQc`CX=BGWyd@-4Lhxb>$XX>iCskz8Ck90v z^qzLwy0eq%M(Oau%!%&5YJjkANrX2`HX0L8Ph}793IAvlQ<{@N^r)JoQLZZpna2q* z<35E6)X%Am(J4Lq)K}A>B*-R*hhPBdMe7(*Y*+ytuI<)0h%-xec5b^JSw9nU+TkxA zp+>QFy0?9!?9S=yq0_fZPf&*Kg?UOmmS|5qcp+<-lpXi;-ln5&1Kjf#o2g)=zi26A zGKeDc_-(grg+<^?Y1DHMa>{LahQyFv8`+Hr@}xxo`Mw>IH4)}^!!OBTy$F^-n&B&s zxjNr#WMJTy$3|e_9}^-oIiT0WSfxe3T{UwimiORI{)Wbxw}$z)o+W>pM{ZHXZYLcH zONo9LsE<-?kSA|-pZUn>s|f5&=+a zXnh=bD4LLWJ|{dXbh0;0do{lh97)CaZudT=7IB0Mg_60!RNE%cT22aJaiSsJKKaT5 zdx4L;P;8c%4a!0Ngm*G(!M&&}Ee9vUM;xu=(Z|jw^0$M6()q!%IY`RWQxp>=;$@H= zzI5R@LR_904Dki#iuRx0E_bu{`EC{4qpzerZ7!M4*cBFOpu-mwgxGC4_))^jxkz$`p_(Xnxg|AUIvsC={pMf?v21Mw~2NDTlT`Rd!yfy$K0CnK|}>L zr^12-gYoe3s8P=%)?&Tf6#S&v0A|9vxj*f%$p?gZ#dE4$%i(XW12AKFyTVZT#?_w1 zFY12sw52&I8{M$pBZ6VwEsR~m@C3oPr^;t*#xB{wiDUU43Rh&OC}~CEqiKvy1t&S< zMbQNeXt~CY&|^K*J;>9+Vi`*VnfnZ`DrGspjEM2+miQGqUTV7>rA%bhaercOht={1$sH?lbvcl5*>OtPDLAG|&x6*? 
z%1PrgS%Qh9k6`Mlwq>r42}5pwJskwby=lh}DO3u?jA&@!b!L(*Jc-0oU!wCCl`Jmp zX4>*OC2x9`(BL*+|1#+McXVN0gIDSa+LQ}42>5ayXD?mBz zP|mNq1_rBwJ6~ri&I+bt{4J8LQ8O?suFC-mm=IXFijH>-f&*`qi>k5)k7AF9x4v*0 z)KcTRNVb);Fk@S?2A`NZ&vxrRq1Sr*s>?PVgx&oJX7IM1pD%e`&Az#$ih zC2Eu?&Uf~$k_356NM7eDTuM`{rc17iC*pBOqLb?bdCqmF?3V^3G<`Pd$PrdF!kD%Q zdF$gc7%<9UVj3-qVcJT5+}pndGrc-Fu1&h9iXk6bpPu|0W#NP!_UJ8hu}wl7*3vSM z1hMeMyvu9JPj?n}Ml!X5lCDtB%%T_Q>nfoGViuCQzj5PPXr zvA#3*rX#aDVH}^ddE0K^YGHc%cgqxoqGHQs)xKN2?FBAVBtmuBT!R4^432m1&R| zZ)Y&vr!Xjh;FR&^tHjbxwBrS5DHOJ57V5>ZWq?-#rphz#GZQ$|XnS-ST~4(hZQdnV zJvi<9?8mb(?oabCG-5i5YghyDBS6Xl4wc-OF4nG?MYoz%o8gK-pbOk z^MgkNJ_?t8(5(I5EIW~bcJhnr#GevdasegUbbU8NV58v2+POj6bKCfpuJYWUeoBRQ z1)tYtjgx#6^1Jq{+`v31E|FHC+P?0=4R~xSFXPU5C?8)n&+@dJZT6=zL=#gl_2wlJIGsOAhj|}H51I@wm^TtFioE7aQr%H z^zb)?U!)evIX##?&d`AsM&44TJ-B%2;v_Z29rCnQpy`0*s_xc$7gqbeD`uT%(=p#5 zJNz`--nUkOAHz0__H)_Dvp{ZHv1&c3tpj(kZWj|}x?z}@rbb24Yk%}vh`h>~eAIje z3Hs{n0eMaPDKtm`?s?rJu-y780)8De>Yr2BsY0WYx{bgT)gma=;+Ay`+B>I^W;IWj z60Mn#uP!bI))A9_C!u|(Q(VD1O{Ud!(sd5cD1+5nhN z?;agI^V1K+9}Redc56JkL}*`n%D?2lDpDJ>o^J25*mwL@2|2LDWMA34Pgvq~>@*Xs znX^PyC-s?&H#9H4&QsFGSFq$%_~C(1M#Qdh|8Y&VTRK10;(}eFaS%p;iqw=Q2rnS( zmgsFh8ePL=8V*~|n5h*&^szlt6vR9621&!Ux$V&`nRA~h z803TcWYK&teMg9xTB5e~{T{R%2NJV^^qOqB=N7>Fx}940!P`Dwo|F?l^4IxDZj4cNuLlxMGQnaU%oez$NTEdN1V90oRe`B;L%S>j&3DLk?Ly z__F@OH=Sotj1d_5Ex``pPba_)sG1n$`o{Imul`TYT_O?tntiXLw1~irO$>)mp5Nu8 zCf|$}gJVa(yfc=_a7XPdo-E>L(ESQ{xJ6h1zxJo2FPYAv-D>v29y$BGb)yhA8Ao2B z8{r0%1rG*H8S<`2U#vyLa{B`&JZ{kCb7Ah%1J|KIiHX5i8+(3tO0B3>_dfp2d)gp? zaj^{VZNVV|_S0mA4oG#%hr4ZjGxSGII%a*nm!I1OzTUBJW=Z#KB#QjX3b?)d{0S~+j{a1a0^UTa>(mB=05;J~t6F5FKZ!M=SQV|t%73;2? zQYN!b^4dtns@u)3lF%_)+Hz@iEFq6pe#ejV?sYpQOr`i`To<=Fc;L4ms~d8c!D&@~ zKa5y^1d7q6#4Jna!-M1eh6?0qb#)gA+XSmvVKbUV#2`*3O5{Z~&82;6Fn_^z`cByK zQv@uM{L;WR(d*YXzCanpz(iP3tT3b&*Vj>>VvN?>C;5-q*)#8$KcY7+_F}kN^faVl zZ1kNN!1Dw9lAzE{N(IVtJNmrntk0WHr47&Y=!K~cfksZ0;tVUbU`Zp1EMl%< zfo7Y4Z39Uy1g?K+0NbK&u||57KFKEw$O)lh6*l!SHayaY!iQvHVgFHI zi4*FA41u-<*}+UV4SRkd&6cw6*1i+gwU6tSO6*2Mi_`iwc44^`8OLEeMH*u^$uBhWA26JaLx=|_;6a$NzL7DlyN6tE9jN6C`_v-#*I=Qio7Ws^O5aDNq04Tu+5{Dg%QYUFVx%H_Q#0Xpa?11%XEr!AU zcxcXgYmASz-HV==&@R&8W|3k-%Y*;n9x zzj}aY3$>g52Hs8;?3KBlxCIgoyAFf%pThUm#v*J3-mBVd?lbh6Vc-cAaw?xLN3yBO zjUns;od0{J+kok@$(Yu*xXN}?cD{vpRPGb}$Lf=RfWuUL zv~dS7Ny`P?gxBIXL#{fs)V26%nBxX~m8@2lvnIC0QxAlN3AN2|#J4!&f}J0E6G=so zFmw<(V%^2$%`2S5hE$0q3pqJ&wWSV>>>`uMg@fomR{^0LH{@fB$BEFFyA7h-M6HT!MQC@;$Q>7a83O zHC69Fo6&c_7jAMdpuIl-?%vF-sXW-d=rAz2ZLIhF>yCF}g20aEo)118mIbTj^C7jk z3UW7_a30d-zH}C=4#QoR3byJD&d+Nm_e%)@Gl)I6}Q1=IZR%$0;I3&V) zrp&F>`y+zRSFuE;-QvyqJXuPb%Ma zO|In<^axoJ4gd0c2V?3Cm#8aVF2yQIVzAIWc&-tQRoz0p=lh6rNcjj#?le>oj`ax2 zuEC4jeJ%26@#v9i7WmE&&$Mzj(XkvQDSM{;=$p%@^W;a5cf5(<5!JWO)*8_a0291c zBOaxVsn~}L_0mvMvPYATRsx^PuVy}u;q3&TV2~iJQG>p?hEt)WjfEIo_HzFh4^fFB zwW#T*U6Jovu&fKsBhfB%I^VDZDVPh5A|{S(UkLYlVh2WnANLuCKWeM>&#~m1aZDCB z$#wCDkR2y1SQNrhIxE-xO&`|XZ*Sd!d5zI|f*Kk#=WBC6X9u!GT|#4H4gq00f|cJ~ zh?By&+!}j8IxDww>o68t*I-W&SYE&)gU-v?9o*{^gH6 zqOtgl_t<%)3$o;#ok1p8@gPzma>0<>2`R{qc+fY*x67eKOP$fc1WDD)BL9gXec?$l zCI0e2aL5o`=L-?v-Cfv2c%+d7W$jO}$w0d+bQpQ?PE{_Sch z8Xq1KvsMIK-ftw~{KrdmvG{R5x~)me89Tn@1{H(Xs`Eb|4_`G*Ecl4s5<0oYZkaZg zp#O~7h7Zo@e}+9D8NBjjmVHw22^KDc0Sl#rXr;0UY4x7oSn3RA%KC-6G6ANY&wQST zvHD{jythznZ_TK%haJRkCeZy3da<>uVg2Z2^9v2}&2y2tEzqH$QwAI`f9V`+vRToh zv_d5G`Rjw{^jXBxj%q$HP8?sO&cPFxuHQMq(%{LSO5Mohw~+h>zNF=z$ZLX`gf7@0 z?t1*7DA@aZ+@O5wP}CaEF;V)<`|{!-HY~-Pu%9K~Qw|>!K8#@o5p|NvMBAL+HpTH@zZE(>;cLy6o7TQLF9dfSK3*Lhsyb$u_S6Fj98`0!2Lnvhnk z8Gbcw>FqSZ6;IBD6OLYVbv6k-THVbs-jt{%@~~*T(gftxs)soGCt(ypeVsmVKSvTH z8`fNW@E2!-%@C7^_mN*}^p-vzXNKNs2h6Yix=;9(Zyn4e@RhHrPgjMc+swdVA{015 
zs*2VXaa6Gx`v>?Z5qhGfn4L~TzIWfWKF95vQMI($6`=U!DE&C^;*BF<46hzsn>b8> zcS?5Uay3}_+UHkI&)uJ=uadMj>1yRX9^D_cp)$BenI@#IGlzp(tVBlH^_y6nH+<9p zV3wahybAPfVGUw4AyWwGTP#B!v7+^!_V81-_MxXfnf5|d%l==APzrCOD^*#mprJIG zVd|@+ZTi-;PAh)~;|rhh+4>8T$1B*i(#|mj7#GZI&`CsM4dL~|cqC%^HN7Hc(J7_` zC7j#0GOk_MQagB*%HL+Z=ZfU7`S_#^<^$9Fo?%`wWbtrr9LN6omrzp{?lc}*p4U8b zU_CC3uh@Bgv&@8ye^V1B`A<3+&o}_jQWOaOiRLpgqJN?|biF6LLz0a7PjnR?4n-bf z_{@RmA&TVmctn`k^3O1_(Ze72ynUD+S+Fra{U_g}*#5~^#$x)1#7xQU(|ZxMF?tb+ zW7C9JKTHI#^}-4NiPke|qJN@zHFrb|IdYW-s9la6*UOp;HY!X$(-GNmZT8fv<(1hVq0Qh7IvnB@{x2XG9ze?G?5+}#dX=!I67M7)N3q&7ByQf8rwO_F3MNrZ{Gh^4R{CqcC?aEH~2{hO=&5NS%3k@{CrYZD^AJ|&W-1Y zD9_}2tnnI)jfxLXjXH`mmviers7HTgQ>c!0KTQ&4Nr+($wQugpAs4^?h814cuH}j-JKIQ1i@^mOPBv zma{-R8_|evD7#vAJlEDJto36M;&AD7Di|P1+dQ~yLA&LUYDQ0;0ghhX(>04f@)@2f zJCtToyO6p0T@@2R?jNOZ1!nbS7B4}UskT&)&p zT@hDd8J$R^trZTM z+*0Y5@%sU;%7flv3==&*xz`Kt#|)DKxiDEVoje#w-f2J1v$ZA}f8r%n*ZGm2f#~@& zj_@s*xKL6dNA|ogE3-Z!MF$Q4R}c39R;4-1mCLmlht;sGKd5U;s;4>?AkjL0NmB9& zjEKR)T<^ry+QuC>sHRckc`j^$to`6@hl+PiNLOkD}><( zPKwDh0m~8Y+x+2bTS35)$$d2XdLG@h+66b=7b7(JW)M-{P+^*oAG3{jqU4uk<$nGI z=MKB=v$D23>#Jqv`^~n(@PZoUlv8}O&+(@HmhQ%+{eMIm*njvk+pGhg2uur>j=XHJ zZ-?koLB2Z|6>a67TuY>)Vyp~YkKdRUT z+e@nLTEIYC);trr>+7=L=lBcMbb4j0H0ylUZd;z4>UO2C02^{idbXQM(dJ*1#>P7g2SSc%OgP+Cb$d3Q{Bplv+<^Pbn@R63dlD9v zY4m!@T%1*^fF07m|80HP$zjaPpnLpl|z|{-C#lk>oasMZV>R+(<-woCO zZu>Q0E6^_rD-KbG>$TvetP2&&>bNndb+Q_^;mjf5JSQJb=UhGw#gySN7!- z`WL2%iaj*_<(?lF{I?9jhsKA}ADaIz>@WQM_kI6c@xN`{zcSDNY})_aCI0u!^Z(xq z{&VY@U*y4heyI5Wv!40>R{eiwJq!Kk*7H9e@PDwL|L&uI-;)2O_561S|3@I$G5JaU|*{n(<0oF1c8RQS3K)hWb%)% zm4+|$&GDv*xgB0c^8O|%e#>?BdxYnEbB;4<%J^LqNp8C+OOeo!FH7cYs$AMY%0HSz zdS@yOnr=58=W$3Oh-)q%seL%#Z}%O!Nny|ug`DncD|{@nK5rRbaka$%+Ne&vr)9@| zc!6PAv}g)H^YLAu%@ZY5qAS57FKrUWqOJ)A!ba7;tYb^;cxo+2UKfmrMh?2VufOS2 zJ@Wb3CuJPvXJoGF?>i4i&lmDr`Ax;fyPKm1dvxu;t+!8 zEv~`CiDcKefhfdP*LLvQ_L^e=&*DxtfQX6Fdq*;w*>m6(mAF^gbLv2hrK6>X0mlc|Ooqm;1;LdvI zwp0W>!^N%GBhx&7Z=p7LEMQ}OI@hE5Li<3w72}0Giw1iod8k(L`&3R&Zb_>V)@zvh zIO|&41E8ca&cj`)awJ-2IrmMcb8zn4PO7e9@XF&nju|xwX|@{shhJs0<-2;=!*d*Y zt1A-t-#osXth8v$fw}YS`PoY^X`o!^>Rbnsiz=Cq6X)OT+|L1B0#?o*Cn}PbjK$B1 zY2?xUNZ)W;{WKWYmMO&HG}lzA?Oyb)!SX&3FnZaGdrXp&D*aC4d#y7$ zEeh09tAYZ3b#_->ohXlGKj2%!iy7;`pjs4>v9;3^{svxR`S`NfluXmrs6umfy5gL@ z&gAH>ENFYN{?O#zBe@Q?qmO3^4#4zOK_t&O7Hh78>Zi)J$rgc;Y+r12p z>s~hiM#)zOO~l7?stu0U{R(#sZoT|by5@6yIXMH0J?S-+==+1BGkM>tK%j!cMp?~( za8Ev#h$~g$W$5f)2k)zG5zGCNcqjLc8_B5%`L7kIJ z)Dw^x2xCzK!`I4Ru`u2~S)Lk1?hbA6Mhkv%6z3S1q+qZ_a#fS~kFwPH-jCvmrYh&B2WS?bO4qMqXSeFng4J z71_0Bh~ONV#`Fw#iWo!GuVr?M%@wU>_W>|HxyB3{HCSxR6zv;~dn&XC_-~JpqDL=d z19Rx9!Ta|7Cni*z@zWf%>^Q#(nJi(fwjqJoa>DAy-6+toHb2y+^6*3(BJU{NDvNt;g z7TvwY2NUvn1D8pWO^L`N7O7I^EUG#BL4s3fy~$>z3q9;P7)xl_J?FLAZCs-*x^r6z zolj4*np3q-80MZGAIHyXY0wvs1iGB@OSYdII?QHe31nd5!+Rr-x&pHCU`1O8iVrIb zK^>HbH$Por)QD79By}@a#tuy>Fr>>qJWs56`~B*A;)g%}0=B*>v!S_qO#i3Q%;|+Z2J!Lf^+uCaqp-m zk^WtYJ}IZSJ`)q}4HStVH|$Y8l(idGmsX$vGajGjt;Gd|Hv!x(f)4KNB)<7(-Lrg6 z>SXbIPT4n;L!`zcC$Ts7U6Q;^5cfDconx|@GXF7_MLu(>BBNT2zK&tif<{|@5+XrZ zB0505Sh~wZzujXkz`5Y-BZSxCeIA_%ez9rLozvS7lLJ|54wbp`E?rBVALxM{ zGHnT0i-CIrbe`|Yt$&37Y6f#}SG#{Dv+4}UPCezdTaL>c@E8wFIoLdruH=}++gp}w z78El(Slk}kPUD3&4V{e8+&Ck54M^aEwqb#Kg@;p^5&1DzjNACbC{P5jy>hi z_dxUkaqd!~?KHcp)l}_yzVoGYusxQ;PP(J+Z1IY90c{1t9A*({&3Lcj`^8LU#CQLo z8pj`3d}Np7-)|x>m_Ks&`=A{#Lp=FG{^Djsu0PBim36CTkM7!X>7XlM|2;W}nx*!n z8u^Zby@0Muq%}|?#%+G_ka~q7vW}|eTBh3vq?Mp#@*vR6|g)IM&6$RT30P+vsM=n1|C|j<{S>Q0bL2# zL7ZI-c`42mQAlT;-QuoFd=2aZ2ZD3&N}K$PCe0DWGqMWL!Mof2QG1(s9QYm8SXXIa8W!*nb}@lKJ;V&|L8=lNmTI-SGY z_`{jrjkDw{CGS_Vo7n*x(hQ(P3$qoI**wq~jO15l z3y|24>&grhm$xP)c2oQcYb_j!cg4K}MC-uGkzHTd)Wh#N4z 
zqCa?Rge#=E5^K&`;83;S+E-_KQ*H%)&IZ@y9cVngxbDI4^9hY!`|thoX7vK?ft?eBJL~AX%`$4IpFj7q@=ekbJTF*UBsbyu}f1C zo+9he>jJh5$8F06^B!{^_~ujc*--;ePhd+#pBmipPIO^QeXr*SlxaN5mvDDRkjMJO zK0vEBsPgw*n?tRBi)A~xNF^1b&cuB*%hG;CaS2T^S%44_nd z9B9STj&uC(^k{~tKU18mn;X5;ykrM&)TjpM^qPLZe8IQR)9O=hy_z0=9NCfoa8R-b zQ*1>P11B>Y_qFvombHU>)|9dkzl9D9J*1zTOy*-#qKu)XWOwLdjNu;yH+tVmhzl(J zocur#4R6FsZ7Oe#kwSup65WE6yppCp#$XLRcXb?rKIh_R?-u$8^>g$Dn!L1H9!-54 z+m6ctZTg}g)XH2*S#sTWO9KpRkMeaV5gHM#iCry;-hbq83;J$fo_Z0kz1E-i7Chl; z1V;}kZ$7)VW76XfqGjqhGZ;cW8^d}H1c9nAfF3*t6Rb2;nZ1x7{tQg>r*=9S7>yUC zd;Rh|o;YWe1O1;YdF>^yW)6C|6Dz zVN5DCiv-K)$PYtxP?<$lYb;AMmCmk;ju$P7g}KeClLCEOz4z?(6gKT)ZjRG+gYO<- zD4xMi+TAW1e+&<%?Mst2nE9HWYX0KS{6B5FOqcf&_tfu5k2KQQp9wL81ZQ%z0o^9_qBp{m4Pn0 z;IS7n5g*^|#nCmd@Q`UQ$lSDMA1u;9eV%RR!^Isi;VdVE8fiz2J-Ko>$KMts`(Bls z%qG{&T()6$$OTL8uXc8iy<9swM|ZYpFmQAopk0acuEe>}Dj7fAT;L6e7XE&I8WSiT zkRKR&CZpspyoGvsAftHYU5YB=U9je6`o>+4b4I_@`RCzp=(iO9*nnk_+oH)K-W3KM zoCKXrxSKgwF7_nf8X>up@W5^s;Y7dGN7R;M?K8g9T8g z6Zc|Jk|-Ms--1){gBuLXgts}o?F)f5h5_A|Ck^E~kH(9L46|SDFzz#(y=&&08Rbkr zQgxKMOb@81FoAzu#%&FLZ^W&zK`zA(ajO=8)uu5K=aTMuoaupy?4`IATLv4mvp&C9 z5Eu{>m}uQ+?~&SUm95KLvo%6vHJ5(W{8}ur^A@dLPP18W7Snkr5UD4YcRo2ED5O&j+Uwql< z`AywiN-SAw7q5ZPM#@kV;s0Ui@tzGLd-->DUCEkaQ+Ma!7x3Jex8qoBBbH&A!GKuW zsr`m8u9JYC#X_s|LfdtVP&2y9k>B}7#dL3`fYzm)uOC>-!F1;q&f{R`$d`%v9L0tB z=^r90)Bbd|r<3f?;82P6*N?t~IFxNcWWHj!hlmK6zuK^*wK-$3tmHO~HYeUV20Ee2 zhU@=``sw~y{qovom*sIMrDj4Fd-Oo*e11byeSs0BM(8!~Bjg7@N|xw50#HVa423>s z!IlrDSA<2~dj9j4=L9$wLY=ItRvpdl);E{fTQ*FIRTssHUU-Zl4FtW%}OJQS`f6aM}wZ?m&}bv%vpyHMDm9g=e~0Pap@? zSm0Slqw~I+u1iPCd@qYEBR^l7h2eW{QHl_7_&mZxHy<$+i#s4{y;MeauOs z=9eC?woj$Ty!pV#x`!0#txEUDwCRvq)M`PFbTQW<6UhAA%Evcr0Xq8|Q8Vi|Uu$N4 zfCJP6VzBKIZ|OY1Z(wsufY6-ya~ReZFp73Q{m!2=;O(<(7b};HD==qQsdI3$19aRW ze6plcrd!-=Ezpy@9?j%H$7?=tVS6<@Y_Oj7hpCK?VY0|kXpZAV=H`Cb^E`F1F){hG z-i{X+aG!Ls?!&PaCg@_%JqJ`Ebc1c|LtS1n74vHmq8r62oW3`1X3qx|BjaX=uzfz- zi+w(HrTUWJ$#K<58%9gqE^_IBYZpI!uQ#Sl;`_|mOGR`vk9Z<4QqJ{oLH^W6dgVj0 z)>A9_B~X8HwEHE|>bx^w!FEOa>YL_hA$zKuxTVL3kjYiOR3+nx;Twr`8TlaV@-`k- zrWiKI;#ViBe+s@jm=`=-QNHHUCr*`4;+t@B%yz#_tfRA&sDq~9EGV3djq6Owe6C}w zJhg=N-F~l@=#Cct7;pXF;P9EYX zFhXOSm5&beGJOYGpoWBCM&R=*>%c7r3cB5qt42E;Nw}B#mKB8+EIE4mx?byC;OO>E zc9P5KDlQz!&v43BpAi>t_(tr4O)HKi2yiRYef(mpi{6TORModQtzjckagZ}nn%VdX zF}6`=E!{Q({}l9zZukecyBXS`s9B=fOLq?&M#zMR*XHS^M$b>yswZb$Gxp&qrykHN zEAz_hhB&@OV&MXB#}<9v^LJKaR{0@dirjI-5x>2jD+N~3*DNn7hnNpVP0QPSKiu3@ zr*t}zwU{dLb+IZSQouBSI6LS8v&zK#y`5r?#~MLA*%UJd3%m&7LMn9yRMxPe2geQA z0BWVT)fC-z^Az!Fnod|n&+DN4Y8TnS=5{|4O4{<`hF5PZxJ*#ROBU-o#yj3ir9E2f z=+)&K{(cUnNcO$nRbGFPZ<$ujRDqDz#XSZkZL`l5r6M zHSlc)QSO-Ran`g8TXB*V244!$)iv!svYoiPgW8`z9_g0kHqSdW-bD#_paU)SgK?*v zC1xk@7%wInUDSpOX(F~jPv9XfzTMSnBSeG_0fIC=IR!RR{t=CoYIF-J($dFT-;7>u z(P?Jg3|R~v{j$VdIG@PP)c0O{eH5NJ&eAqR3pFG^g-?zX}LzYF= z81CQQ={m9;Y&z@3G3~j&c?!UVTLf7p4iZvxUNaMC{60(Zmcj5df5bMhWIX(&_Y>jt^!Cr2H2Qg7%tcqT-mL22To^~ z!0nd76mV6BJi6$LvmA3&g!>hzhm?hqMLylx6*c@=_GTaXxzU-)6Bm*{&Cg%YpW;O| zh2Sd@gFE#d6V7mrq?~Ar97Gt=t+lQ8mg;!novfGgG^*_MGmmpb&oZtMAl`ucaAp#P zetI@4h_kM!4er|<5bj2Ix>0e4VT*i>1N&)vTOE1R2OPw?ac;8;%S;Ea{ zDBk2UL~m?JV}I1v6tes9EK+Y-JbaQd(C5$MAA?34mV11L*&K+fZ5m7R7j^*`m+W=& zw{Xd3&P&3}vKbcD_kFF>4&b3L&1rRQ$=;&Bk@ifE@*>%x(Cu}?RNW;bjx6Gx<(Q@e z4GG^eLPNy@s{+<|>{X7ow1!0|aM$45E`v&juxy#xrZRSx{+5GnI~JVnj7z9Yp}7DNsw8e)iY z$uR7kkg-3(TJPX|b??dbD58IO=eF%m^SSt2*3&h}W0%~`N$@6k{d5J`OP5MKhCdn+ zkX?=Z>Be@`j2P?6_2l&&XV| z^uCIOPvt@+0MP8ek-Hs?n;SU?W^d>)N-Vw8yFE(r<-6{q9>|i!6Z;|n0J2-DlgHU_;3h4JLk&&(di`=xHtdoPo9!x69gCyPGHiYDH z`NB)4j3xW<>2nm@7?3xye%0K|NzY(cCcrLu3^?UZNDo4bg7YD%q- zhsr0!_av@@R@q*vvbpA&(^elPi4yn+1;Q2+kmRsCl9lhE%L3m52(r4}br4q{ 
zV%0`$3b{}?7J(Ai`67S~?+Kb?vj|J>B+08CNt~;#WVZ#UTWuj~Rsg!w3RPzbEMT2j zfC_6f&H9f|-ktaWfG`qR4Ge*gwN{?865k7*-(Qysyl76WB@DGmp_K{;avk}w0ELz7 zO9Yl3Mbz5a7M&9x24V)Yu632(;ap3}w?W*5jl=!L>}t19B_esH$~3)jg7g|}?|Yq_ zeAPA*d{xl*;_@7EYW9WbS;xB+g&o(VtVHW;dOQ|wSGEw6ZAX??Px zVOPp*OYpkv4SIfUq9u)J!0d|ajB89#BJ_MG^F1CsI78m~)n`uAgK=9!Vm=ybFS)ICEqEqGuW@f<8EzowZ2zd060Y;f!AVLG~UiEUXt+=^KCBAx57Kg~kXZW7KU!LcT>Jy$>(~B?5 z8s!8&EhP((1MmAosmkoagCOH{^%15cl=r+xBhoYCZFq3D9kmp)szR>B%!O#cOV(co zLhb@bNKA@PVlt`Kr^Wg<+@UDt8qh79b? zd4>?<2QKa84Zv^Lqd0>z>u&DS#O__J8mSDteMi9A-`UBoyY44plj%1SQOoV^JP1er zwGw0xSYzm!(YEO{LF=rESaVzX9QfUBUMi(WqAw(9d&4+AF4$M&M+%@#xfzM{60%wS zf=KYmfKsVmBVmg+5$!ytEU@LfjC0UxB?94|%HNgQTp{6qKS2o4J4?_AoaAgjyw^3l zOjeRuUlg>>2W+gGfybpL)`!@!h6wlBP?xEos z`ey4gc|D_$AsX!NSdydPP8)$0Re_)h`fSgLybEzLX^iu8TrUs*EVy+)en-9z)%}t& zLy4I+u_N{5O_9X8JCz1m^GYCe;Y-y%qjKxdOOErXAH#4=Ya>MG@h+?y=HaRz-S&p$ z!rQ1=WEk=43_z4W53=LO3i|w2Vl$(c)lJy{FfZ1Ofl{u;{Up;q-6)4>4A-2(c&{sQ z!qbd(we#@A*q-c=`hX_c+W*H$k~&i4hEIGP#l6ORGvpj6%>+_jkJxwL8Fta|`i?vh zLE1DJX_-~3*Ux0cM;2+rFylshX7mjNq zEBH(-`{G$iXgEiVD%=5OS;C;0r`gO`twCT9M{pl ziPKId_)EhS{t)72rW-y9;3;5;%OfO+b!+s7hy8PYIi2U8%i2g;$r`LY%Tuv6$z6)^ z`*v&hQA$lK-I(Kzb6SwzuVs2=7Dd(@ATbGJt7LuPTFmE0p4T zLVqQ=!~BxEIZ~J=H2?hwIdzY8h3{}=p<4gmO;Umxg83}DF;uDY!%?8*oix++-9Yg3 zU~KUPqCHTM8ece&SOC(wG2DJ+s%Hq2m)WdLO0La?ix&}3q&=P!0|ybG6qXQ{;?qyU zzQNx2&~1a-9mu;XrU4;XJyBO1CG~fdqQWg&pBjG8$x7W$grNN{{n<%$4r@cwV?{f) zlj>TzqEVSY7yMLx>aT$EIhoDHoJB`9{t!z94VnVU>~igHo2Aiw&Kn9KEV=YFeq$H-H|K%V79Zh4%FN7 zf!}TPk1u1dh{5-xfOSk7AuSpg z$AskxR9XB6vGgl0nj0FtT9yU0d45zM+0AR9-AJ3QZZFuK&IyXk_ zc0kn;M&9Mnfn!bz%rG?h@59Ne6ebEeCPQBWV-c@@Jac)=rZ@S~W6G7cHllOTg>6+a zqU%rY%hWLBPIpj<5-2Q0g75Qrnuz7t7I`~QT;$^Bu6aSF%ZZdLSpIH?--^r4a<;$i5ucngHbd!*R`gndPI8qmDU_yW3glIs^Z03-bJTzfm{@2jU7}X!?IYom$k|+X+s@d?dkFSa$YmYg*yhqCWyahyCE$!wyrpTTRznN zHGfQavX*COxYd@z*%TiYzs*x=ps`HVeWC$uf5T-okc;89-XNqtr!+vc(Y;LXRu1fo zYfdD(7YC6NFhZ@T0CVGoIO6YNCTy}O4s#9mB&$N~!`Iv9`HxEq?b5iae2ZmgIshTl zf1KskDU|{S?|s?>(p}s5Awo zNbf>`NUtFx3L;hMH6Tr;_ny#04@e0RAV}{4LJ~p(hEBl^F2{;DXjJ(JxwrgoqZiMpT zN)DnS^kDb;tnHb<^-`k`U(LUfM&?8QSqo!fv-~aEy&=KBeW2zPYnvmlU#|E{ zXwC4?X-Qg%CRfSUjXBBB(It^(X3nQTY`OUNRpL%dOMt3T7o@%znso_5-lyHLpxqG? 
zb?LfD_;zusC9ZL^L0o4PiGAw)_M;^{w#;Teq&t&Q$)J0K%ZAU;3_I@15!>A=CG2{k zZ85#Qt$<8n-h@l9oR$Qdj&zOv>dWh>xJsgNW{D<&Pm2UWpXCKlWzrtcKw=#Q-EO&l zZnXkLT&xMA4>f2Uqe__Bu$&t_JhOZ&q|I8PT>25V@heH)^R3vfW@BAf2wgQTAyy_w z;Z>kjW4zRry)e%NA1ss5M0PVDO>hU-TcqXA#zB^K5IU|>sdrAWYiLuZ-C`Ykk8Qf- ziXy87nMvWf?c>2&i9N>PYYtKpewwfK(+*TCMTzb7S;>MyJ*jQC9zklXeZRmuay|$0 zRI9%vZamyz26ZtIwr>+QbO}AZF*TBTcXeqg3KM?;^AI7-VE%ITKS6b2Mp6|I{iS6_ zUf)=|nEt3m*Qu4SC)DxbcNv?BW*b#!rijS!VfJQXu|t zg+I4vp$Go*9>k9WO`ZY0!PlMVr~Gu%%qhyz3(;WFHNAbtQv9}7l{@{g%{?{!)80vf z;V_HFQI;Jgq9@h1!bVsPSEexqZ6*r<*Ssy3~@jIL9O zLXBNF2>`I{?7-EI%{I-$Yq^i^&RyPL06=D?#Cg7aOysISlRIl+&Jsh__`kG!-QNf9P|IW`~K(1T-ErgaMH0yE?Y zT2_(ZTxv>9)THmLO2`MY!FS-+i?G%x%1f%=8`hCm&5~>Hb0ij9m%3_<CPpD4g)9?0E zq|lL=l)vz4D)_!)v%yL1Tk3}+YU$6A1c4p)X!w)C6mnjzu-+LQSu zd{!5K@0p5My_}9*N)X}Cu_7HV-{H<~79wNjM!yY{o2!8~KqN3pUflDB%fuL{^9Xk8 zM@e#J=XC@==RXvML_#9wxFE|Gn>Myb-$L#H9Y7<1x{|)sOUrNztaVLl?A`gppAx!w z|5^lESSC!~Lu`Bd9>iIV1?Mkwd+b{!x#K0*^02{fdnB55`1$(#&iT9fH#lxfhPM=| z9T?m<0E*gw-;q8OpZvL&6Obx*(WOZh4LmGa1xCRQ5tvv3d+3EWR;qr82}~tQchj1W ztn&AJ$Wy-1@`%JMF<}SKe>|;4UzlCId-`@%_<8THO>wIhtATQ6z;oP}Tc}0$(csfF z399kZW_HJA?i*C4PTHd!D&+A}$R8_u%P$aa!ovc}k(|5yJ>gvg*g1KLpC3A<*INw# zbjm*SRr*NKm*E)1m?U{0WzZMzXn2$Sx5#3UQ>Qw?hrYkyXkpYyxUlVMRt4B<)rC@^ z20mmy3Z_LWCbbD}qsTH3qA08$&#T`-_btQM)wd39<;fKZN*_-|qB{Mr*M+1bUVExL z&V8kRPNQ)dewXyZa8dJeB|ihn^1{CO^#5CE;-$yNKN6-2h>|3`B(Gd#h0*@}r0_v35x0a4(E9qr#l zDZ-zvmxOqS*S>xXMv0Tv{x?P>jc!uP9Sk;z!Z}RS_iNxF=!(e6smQ>9>3o#A>ohRo zUasCn!jfyqb(lMeAFkigO8-Mcx;!yyREy<0M0T&t@pcd{?Vb9b`A-l#>1G$DAbn-| zIG7J9`4^pCsQTs4;&(nNYro2yfMS7*LE6*AW%EyOaCcX@Y}{gg{Sa{^i>&H;sd+r# ziok0)?Lfk>1Rs$_Nq0zXn{7uLEbec5uyN~lHv((p(Y93T$L{3)^~xz@Z$*`L-VhLpxX`5D8(GR=BckqkGfa5fm&NcEwdLfhIA87 zRlYr-9vLKX`2?I^U6m0?Z*OyJX9P<;2$DZj{X5g5MlFhE4x`vUSk-ApeW*eF+^8G1 z9^WjvV^XzXb;o{uZ01j-rm%Y+W@0yNpIo?(>fHSl3N4A;_liSEH%kFTAO{OS^`KT^dp4=3S<6tz*2-JogRMZ6WF|+H~Y&m(pO8$t!i( z6~lR8_}13ufq#+wb@LfP*LB2;z$aTl?%o`gpgSy+9Y2w z*IY(H12^`0UYr__=J>9=ZH?q4r9BFuoUydYsu&B;-cM^`)!u&pPw2d-Hhjs#g1>DS zwI@BsJ!jylqetW7>e{s4$z#Nb=Z`j9QplhwfVurG4NNizaW~c-{zAJJHN6<0DwAeW zawdK=cFHyPIQfzWw0(K15~-}ewZX{m^t(1maqcZu7#%rH6v|faQ)`$%R(+dn{tQ^w z&nhZ9|HAu|xnl&G=+eERxU3x^MQ(awwp@=U&M^d6lhf-b8tAe~on;Cjqm+G17>SpC zqv5Vc?68kUy6>4e#dnQ$;Pn1x+Y?@Gt_SlhXPF@sKXZ70#kTR5KdC;9JOfdMPm1qj zJWxH<;KLT48kpJWwQ@i6(kY_LRxW0x{12!{?>aCMU#Zy0WM%b_^K}oJ z!zYKwZ@J(14%ROhhKc6`rgx$CE`S2&^d#p3H0S(=v6kX@q@PK@sr}`qfqR{&j6O*g zs!R>r{?`gyM26kgpP93@Xv{qd5_#4vo}^U|Wt)vIXbJ2AI|9cem)N%y56518Qe{7C zv3^>R6Bwg}7^lXQP%U^giiW~NFT^E@?o=dwyYp^&mw zQ68tUw&J(8q6HSxM3@Bd*teX6n+!ETvl*$siV1^2k4y+hI05JV`^`_F!fwso71BN2 z#+_TL$~c7snwL$4JhP9BU!r==$BO}_Wxlst+zTUJ1`+}mcVyyoG__=9)_l6g$D+cEzH z=Uj*$97;F4!4&Qg#|iGTO@j424DI^6ltBFxATf>dVIJ2<$k2V=7}T!9u+Nft`b%AL z$Fbjk`xTe*c~@$lYDFFUOAYbKpZpyU*xL=r?~aFP>`{0)h7@tneilRW@5F?+ggaQI z5to&69{q_>*Tg_~-Z4-3cgzb@V^4SY0Mv-3D|=T~(ho~vOmBWccpQXZ`z7YYCFdO? zRrnK5=$3bmvl9op^dLkSs;~kk5t?~(|g(v>(1#P!=npiwQj6; z|S0I+GsI-1F{dBVYIw@lr5%eAR%Kp&%yvaU3^!tl5V(3 zTlgr27RT1|gqwujuI|jVGxLgpVUt75uaXN^%&m)9U8ejp@nzC;6{E!7(@cB!IRzw* zR|nFyc?}c0n;tx1cgtH@fp6-zkQZ8oSdp8v-TRT#2yYcjI?sGJ_qQ5*Eru z^UrdkMkeeznxEYVTHeQtI>Ok{3x9g!J_uE^Bp6b5XMDQm_?n+ZSVp$?N4n>q5#-j! 
z?hJvY6`ew;d^HR1d~p;>!8xuWsBP)Fpv${w;nr||qK)cW;IlSo$~QR!w-2gUWck;Y z&RF-T5$jUUh=U!|A!kF))I zb=_xVFejm`fFxHNAzBc~z4~m2MrcW<3f8+SjHwlQpU)eG_~7-FJJRR;%?y5Rky@+0 zR9);c1L=TV^8*j!Z{Km$3uc$Ky>jjUVT|0;S z)z2>*1P(3qiE&;(Y`WB>Tps)hYW8))|5)_zgB_7YlQ7akqtzCuu1#yn?B6P7x(VAjHpu1B+~7(tG}0$2F3- zt?8M99}tQY-mb|$p7~ka#KOLaQV6ks%r{(lVbgQ!%YESyXX&%yyC-98J${LFfw{}Z zM|_v4_2)P_ur}|A0)PeQoC_Mpd;mz_JsvMAlF*Hi^|xnn>01aJT%32^d(O;kK~Ifa zW?X(^Y&-H0MtfA)Pv6_{5=6?KRni?0TK41c5u}1gMZ>240r}Gl1siHp`7^f?onD+e zWk3}5d`X5brF}*_pCw5qPjxtvcdt7qtH11Qs=3puk^Y3r^E#G>4aYKI`Yed04^~tt z$2f659@G5BF?LhH0h{woX_d#tZR%xFVQHj}DV13;_Rf5vqQmZE2-juDQC2!y28f8| z#d(_S+$RSZ)4DrXrHo%pbLe~zo!VjO7~@}sN=aM+8a}_#RQ5nuFLWDzTZvEZO!pn{ zy(16AGtE(=1`q9M(CWht(F1Y`*I{1m_dr8!0K3pgnS(g+Zbb`98F%wR(rulwhF!`W zixeGh#q;Etix@|*uAegj@CT;zICkHcwX|DNU6K29q1P`(_uE@rwJfHS(9N)o1GAzU zs{iHc>G(U z<@&z>9-=nb6fepA_vPJLi6g88XFU_};NC2)tv55j_?$0{8IL zy~o1Nt)w(O=3ReG#ED$yr+^dvkn5gr)1sFlMh`-kD=@v={uXE83#XaWTzh=ePjb3; z>{jGnlI@OIcbu%#)~ONo{VC0QV;33*tQe!YMCec6Q+B_7GQ+}Sx8o`Ax0G`ZW2Ueu z&&-Fd;=*UdpBYZb`T#yp+ZzjO?uDx}?P3wuT_yFawsNv1t3qpMj^&-4AkpZ##V zjOt`eZ%t5sOHX7fmEy?X9`QtWFw8OTFs$spXWBqt;QG{bb;_HN6^j=1cI5oQA!?iV@Eg1Z&n5p1EsPrzTY(dQcQ~I84ABx^0CZB+w ztTz6ghMUL+R>Yz&P=eZ8Uc;#cts0#MpU@9OXZb_LTu9G|5~mCCMR~~bB%so{u_)#V zXxK~O$^0D0#uya+N@f_QAFLtKRtlr_Z}wT?RqtQ8OMZe=(w_J$cJ!6^bB<>@3^njp zY^U%5ow+-2?XE-2cKO;Qv`)rvGzFcBGeL@77USdu77<_Vd-IF^zfdFIEod!lP|und zw%{*`1{Cx?N~lD0@%!?iZ=S~VZ8)P}vE|Z9IN$ngwzD`dCfnw(&nJr1&jz>-+&7&o zdZ_kXVdY7~TmbrPaVS3Tis#p{)T44IO`m}jV~j3TVqD3Z-ytVKb2lu_I$^U^Z{Gcz zZat0T_Zbe$;B;~{$t+H)ZFTWys)c*}MIp#T;)WCK?#QlJ7FU;*T$Ie7NO2m&3@XP! zMRYY_t9C?eowtCRGR(fo*tMg3zYD;Cb5KpJ>DP;iPXB>%cbFz6xii|!!}4d?nS0=T zJ{@!EQYRofIh11BPzcvWWv7UpSTRj17jVE7H=zcVEte8WD$FwIDV|Bf`DmU-2u9wf zXH`rLP>(`!HdxFZK5t=bV4sfO2Zej2YJ(3qaB3yE<*SmoA)`LOEN}d70Z#&`2a`E_ zqmj0Lk^MP8IMEsMQ8I*Ec>|(cPit|Rp44#DYX|(EW-TWdQlZQc_+jPF(MZl&=2ek) z$M7IsqoV*Cr{!1dgc^=ao^`qoE0@N>s3R_|@$-ch|$(MYlaUwE+S(6At@ zH<+q7HVm6@VfEnj$mX=4uk`O&fWGhgChiF`Xq=j!oa^C7G*hdtjt2LyJ={cuqJ}Lx zav+=pQYE)~zBnGGng5}m zfeElA-VT;?RL^Lh>*Z{TJ@Az~`siP%PVlnl1TkuIQs&=6L|d2^3P-eMM_w-oL)tC2 z;My&E!ge0~d|kZCvqmib;^yWX`ftS(nnoke*QF^9skgZvVEY~hBFcK<9BUmbvajd! 
## Notation in this doc

* `T` denotes the type of a vector lane (integer or floating-point);
* `N` is a size_t value that governs (but is not necessarily identical to) the
  number of lanes;
* `D` is shorthand for a zero-sized tag type `Simd<T, N, kPow2>`, used to
  select the desired overloaded function (see next section). Use aliases such
  as `ScalableTag<T>` instead of referring to this type directly;
* `d` is an lvalue of type `D`, passed as a function argument e.g. to Zero;
* `V` is the type of a vector, which may be a class or built-in type.

## Vector and tag types

Highway vectors consist of one or more 'lanes' of the same built-in type
`uint##_t, int##_t` for `## = 8, 16, 32, 64`, plus `float##_t` for
`## = 16, 32, 64` and `bfloat16_t`.

Beware that `char` may differ from these types, and is not supported directly.
If your code loads from/stores to `char*`, use `T=uint8_t` for Highway's `d`
tags (see below) or `T=int8_t` (which may enable faster less-than/greater-than
comparisons), and cast your `char*` pointers to your `T*`.

In Highway, `float16_t` (an IEEE binary16 half-float) and `bfloat16_t` (the
upper 16 bits of an IEEE binary32 float) only support load, store, and
conversion to/from `float32_t`. The behavior of infinity and NaN in `float16_t`
is implementation-defined due to ARMv7.

On RVV/SVE, vectors are sizeless and cannot be wrapped inside a class. The
Highway API allows using built-in types as vectors because operations are
expressed as overloaded functions. Instead of constructors, overloaded
initialization functions such as `Set` take a zero-sized tag argument called
`d` of type `D` and return an actual vector of unspecified type.

`T` is one of the lane types above, and may be retrieved via `TFromD<D>`.

The actual lane count (used to increment loop counters etc.) can be obtained
via `Lanes(d)`. This value might not be known at compile time, thus storage
for vectors should be dynamically allocated, e.g. via
`AllocateAligned<T>(Lanes(d))`.
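For illustration, here is a minimal sketch of a vector-length-agnostic loop.
`Axpy` is a hypothetical name, `hn` is the namespace alias recommended under
"Operations" below, and the ops used (`Set`, `LoadU`, `MulAdd`, `StoreU`) are
described later in this document; this compiles for the statically chosen
baseline target:

```
#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;

// Sketch: y[i] += a * x[i]. Lanes(d) is queried once per call, not cached.
void Axpy(float a, const float* HWY_RESTRICT x, float* HWY_RESTRICT y,
          size_t size) {
  const hn::ScalableTag<float> d;  // full vector of float lanes
  const size_t N = hn::Lanes(d);   // actual lane count, known only at runtime
  const auto va = hn::Set(d, a);
  size_t i = 0;
  for (; i + N <= size; i += N) {
    const auto vy = hn::MulAdd(va, hn::LoadU(d, x + i), hn::LoadU(d, y + i));
    hn::StoreU(vy, d, y + i);
  }
  for (; i < size; ++i) y[i] += a * x[i];  // scalar remainder
}
```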
Note that `Lanes(d)` could potentially change at runtime. This is currently
unlikely, and will not be initiated by Highway without user action, but could
still happen in other circumstances:

* upon user request in future via special CPU instructions (switching to
  'streaming SVE' mode for Arm SME), or
* via system software (`prctl(PR_SVE_SET_VL, ...)` on Linux for Arm SVE). When
  the vector length is changed using this mechanism, all but the lower 128
  bits of vector registers are invalidated.

Thus we discourage caching the result; it is typically used inside a function
or basic block. If the application anticipates that one of the above
circumstances could happen, it should ensure by some out-of-band mechanism
that such changes will not happen during the critical section (the vector code
which uses the result of the previously obtained `Lanes(d)`).

`MaxLanes(d)` returns a (potentially loose) upper bound on `Lanes(d)`, and is
implemented as a constexpr function.

The actual lane count is guaranteed to be a power of two, even on SVE hardware
where vectors can be a multiple of 128 bits (there, the extra lanes remain
unused). This simplifies alignment: remainders can be computed as
`count & (Lanes(d) - 1)` instead of an expensive modulo. It also ensures loop
trip counts that are a large power of two (at least `MaxLanes`) are evenly
divisible by the lane count, thus avoiding the need for a second loop to
handle remainders.

`d` lvalues (a tag, NOT an actual vector) are obtained using aliases:

* Most common: `ScalableTag<T> d;` or the macro form `HWY_FULL(T[, LMUL=1])
  d;`. With the default value of the second argument, these both select full
  vectors which utilize all available lanes.

  Only for targets (e.g. RVV) that support register groups, the `kPow2`
  (-3..3) and `LMUL` (1, 2, 4, 8) arguments specify `LMUL`, the number of
  registers in the group. This effectively multiplies the lane count in each
  operation by `LMUL`, or left-shifts by `kPow2` (negative values are
  understood as right-shifting by the absolute value). These arguments will
  eventually be optional hints that may improve performance on 1-2 wide
  machines (at the cost of reducing the effective number of registers), but
  the RVV target does not yet support fractional `LMUL`. Thus, mixed-precision
  code (e.g. demoting float to uint8_t) currently requires `LMUL` to be at
  least the ratio of the sizes of the largest and smallest type, and smaller
  `d` to be obtained via `Half<D>`.

* Less common: `CappedTag<T, kCap> d;` or the macro form `HWY_CAPPED(T, kCap)
  d;`. These select vectors or masks where *no more than* the largest power of
  two not exceeding `kCap` lanes have observable effects such as
  loading/storing to memory, or being counted by `CountTrue`. The number of
  lanes may also be less; for the `HWY_SCALAR` target, vectors always have a
  single lane. For example, `CappedTag<T, 3>` will use up to two lanes.

* For applications that require fixed-size vectors: `FixedTag<T, kCount> d;`
  will select vectors where exactly `kCount` lanes have observable effects.
  These may be implemented using full vectors plus additional runtime cost for
  masking in `Load` etc. `kCount` must be a power of two not exceeding
  `HWY_LANES(T)`, which is one for `HWY_SCALAR`. This tag can be used when the
  `HWY_SCALAR` target is anyway disabled (superseded by a higher baseline) or
  unusable (due to use of ops such as `TableLookupBytes`). As a convenience,
  we also provide `Full128<T>`, `Full64<T>` and `Full32<T>` aliases which are
  equivalent to `FixedTag<T, 16 / sizeof(T)>`, `FixedTag<T, 8 / sizeof(T)>`
  and `FixedTag<T, 4 / sizeof(T)>`.

* The result of `UpperHalf`/`LowerHalf` has half the lanes. To obtain a
  corresponding `d`, use `Half<decltype(d)>`; the opposite is `Twice<>`.
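A quick sketch of declaring these tags (the variable names here are
illustrative, and `hn` is the namespace alias from the earlier sketch):

```
const hn::ScalableTag<float> df;   // full vector: all available lanes
const hn::CappedTag<float, 4> dc;  // at most 4 lanes
const hn::FixedTag<float, 4> d4;   // exactly 4 lanes; HWY_SCALAR disabled
const hn::Half<decltype(df)> dh;   // tag with half as many lanes as df
```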
User-specified lane counts or tuples of vectors could cause spills on targets
with fewer or smaller vectors. By contrast, Highway encourages vector-length
agnostic code, which is more performance-portable.

For mixed-precision code (e.g. `uint8_t` lanes promoted to `float`), tags for
the smaller types must be obtained from those of the larger type (e.g. via
`Rebind<uint8_t, ScalableTag<float>>`).

## Using unspecified vector types

Vector types are unspecified and depend on the target. User code could define
them as `auto`, but it is more readable (due to making the type visible) to
use an alias such as `Vec<D>`, or `decltype(Zero(d))`. Similarly, the mask
type can be obtained via `Mask<D>`.

Vectors are sizeless types on RVV/SVE. Therefore, vectors must not be used in
arrays/STL containers (use the lane type `T` instead), class members,
static/thread_local variables, new-expressions (use `AllocateAligned`
instead), and sizeof/pointer arithmetic (increment `T*` by `Lanes(d)`
instead).

Initializing constants requires a tag type `D`, or an lvalue `d` of that type.
The `D` can be passed as a template argument or obtained from a vector type
`V` via `DFromV<V>`. `TFromV<V>` is equivalent to `TFromD<DFromV<V>>`.

**Note**: Let `DV = DFromV<V>`. For builtin `V` (currently necessary on
RVV/SVE), `DV` might not be the same as the `D` used to create `V`. In
particular, `DV` must not be passed to `Load/Store` functions because it may
lack the limit on `N` established by the original `D`. However, `Vec<DV>` is
the same as `V`.

Thus a template argument `V` suffices for generic functions that do not load
from/store to memory: `template <class V> V Mul4(V v) { return v *
Set(DFromV<V>(), 4); }`.

Example of mixing partial vectors with generic functions:

```
CappedTag<int16_t, 2> d2;
auto v = Mul4(Set(d2, 2));
Store(v, d2, ptr);  // Use d2, NOT DFromV<decltype(v)>()
```

## Targets

Let `Target` denote an instruction set, one of
`SCALAR/EMU128/SSSE3/SSE4/AVX2/AVX3/AVX3_DL/NEON/SVE/SVE2/WASM/RVV`. Each of
these is represented by a `HWY_Target` (for example, `HWY_SSE4`) macro which
expands to a unique power-of-two value.

Note that x86 CPUs are segmented into dozens of feature flags and
capabilities, which are often used together because they were introduced in
the same CPU (example: AVX2 and FMA). To keep the number of targets and thus
compile time and code size manageable, we define targets as 'clusters' of
related features. To use `HWY_AVX2`, it is therefore insufficient to pass
-mavx2. For definitions of the clusters, see `kGroup*` in `targets.cc`. The
corresponding Clang/GCC compiler options to enable them (without the -m
prefix) are defined by `HWY_TARGET_STR*` in `set_macros-inl.h`.

Targets are only used if enabled (i.e. neither broken nor disabled). Baseline
targets are those for which the compiler is unconditionally allowed to
generate instructions (implying the target CPU must support them).

* `HWY_STATIC_TARGET` is the best enabled baseline `HWY_Target`, and matches
  `HWY_TARGET` in static dispatch mode. This is useful even in dynamic
  dispatch mode for deducing and printing the compiler flags.

* `HWY_TARGETS` indicates which targets to generate for dynamic dispatch, and
  which headers to include. It is determined by configuration macros and
  always includes `HWY_STATIC_TARGET`.

* `HWY_SUPPORTED_TARGETS` is the set of targets available at runtime. Expands
  to a literal if only a single target is enabled, or `SupportedTargets()`.
* `HWY_TARGET`: which `HWY_Target` is currently being compiled. This is
  initially identical to `HWY_STATIC_TARGET` and remains so in static dispatch
  mode. For dynamic dispatch, this changes before each re-inclusion and
  finally reverts to `HWY_STATIC_TARGET`. Can be used in `#if` expressions to
  provide an alternative to functions which are not supported by `HWY_SCALAR`.

  In particular, for x86 we sometimes wish to specialize functions for AVX-512
  because it provides many new instructions. This can be accomplished via
  `#if HWY_TARGET <= HWY_AVX3`, which means AVX-512 or better (e.g.
  `HWY_AVX3_DL`). This is because numerically lower targets are better, and no
  other platform has targets numerically less than those of x86.

* `HWY_WANT_SSSE3`, `HWY_WANT_SSE4`: add SSSE3 and SSE4 to the baseline even
  if they are not marked as available by the compiler. On MSVC, the only ways
  to enable SSSE3 and SSE4 are defining these, or enabling AVX.

* `HWY_WANT_AVX3_DL`: opt-in for dynamic dispatch to `HWY_AVX3_DL`. This is
  unnecessary if the baseline already includes AVX3_DL.

## Operations

In the following, the argument or return type `V` denotes a vector with `N`
lanes, and `M` a mask. Operations limited to certain vector types begin with a
constraint of the form `V`: `{prefixes}[{bits}]`. The prefixes `u,i,f` denote
unsigned, signed, and floating-point types, and bits indicates the number of
bits per lane: 8, 16, 32, or 64. Any combination of the specified prefixes and
bits is allowed. Abbreviations of the form `u32 = {u}{32}` may also be used.

Note that Highway functions reside in `hwy::HWY_NAMESPACE`, whereas
user-defined functions reside in `project::[nested]::HWY_NAMESPACE`. Highway
functions generally take either a `D` or a vector/mask argument. For targets
where vectors and masks are defined in namespace `hwy`, the functions will be
found via Argument-Dependent Lookup. However, this does not work for function
templates, and RVV and SVE both use builtin vectors. There are three options
for portable code, in descending order of preference:

- `namespace hn = hwy::HWY_NAMESPACE;` alias used to prefix ops, e.g.
  `hn::LoadDup128(..)`;
- `using hwy::HWY_NAMESPACE::LoadDup128;` declarations for each op used;
- `using namespace hwy::HWY_NAMESPACE;` directive. This is generally
  discouraged, especially for SIMD code residing in a header.

Note that overloaded operators are not yet supported on RVV and SVE. Until
that is resolved, code that wishes to run on all targets must use the
corresponding equivalents mentioned in the description of each overloaded
operator, for example `Lt` instead of `operator<`.

### Initialization

* V **Zero**(D): returns N-lane vector with all bits set to 0.
* V **Set**(D, T): returns N-lane vector with all lanes equal to the given
  value of type `T`.
* V **Undefined**(D): returns uninitialized N-lane vector, e.g. for use as an
  output parameter.
* V **Iota**(D, T): returns N-lane vector where the lane with index `i` has
  the given value of type `T` plus `i`. The least significant lane has index
  0. This is useful in tests for detecting lane-crossing bugs.
* V **SignBit**(D): returns N-lane vector with all lanes set to a value whose
  representation has only the most-significant bit set.

### Getting/setting lanes

* T **GetLane**(V): returns lane 0 within `V`. This is useful for extracting
  `SumOfLanes` results.
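For example (a fragment in the style of the examples above; `hn` is the
namespace alias suggested earlier, and `SumOfLanes` is described under
"Reductions" below):

```
const hn::ScalableTag<float> d;
const auto v = hn::Iota(d, 1.0f);                       // 1, 2, 3, ...
const float total = hn::GetLane(hn::SumOfLanes(d, v));  // sum of all lanes
```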
The following may be slow on some platforms (e.g. x86) and should not be used
in time-critical code:

* T **ExtractLane**(V, size_t i): returns lane `i` within `V`. `i` must be in
  `[0, Lanes(DFromV<V>()))`. Potentially slow, it may be better to store an
  entire vector to an array and then operate on its elements.

* V **InsertLane**(V, size_t i, T t): returns a copy of `V` whose lane `i` is
  set to `t`. `i` must be in `[0, Lanes(DFromV<V>()))`. Potentially slow, it
  may be better to set all elements of an aligned array and then `Load` it.

### Printing

* void **Print**(D, const char* caption, V [, size_t lane][, size_t
  max_lanes]): prints `caption` followed by up to `max_lanes` comma-separated
  lanes from the vector argument, starting at index `lane`. Defined in
  hwy/print-inl.h, also available if hwy/tests/test_util-inl.h has been
  included.

### Arithmetic

* V **operator+**(V a, V b): returns `a[i] + b[i]` (mod 2^bits). Currently
  unavailable on SVE/RVV; use the equivalent `Add` instead.
* V **operator-**(V a, V b): returns `a[i] - b[i]` (mod 2^bits). Currently
  unavailable on SVE/RVV; use the equivalent `Sub` instead.

* `V`: `{i,f}` \
  V **Neg**(V a): returns `-a[i]`.

* `V`: `{i,f}` \
  V **Abs**(V a): returns the absolute value of `a[i]`; for integers,
  `LimitsMin()` maps to `LimitsMax() + 1`.

* `V`: `f32` \
  V **AbsDiff**(V a, V b): returns `|a[i] - b[i]|` in each lane.

* `V`: `u8` \
  VU64 **SumsOf8**(V v): returns the sums of 8 consecutive u8 lanes,
  zero-extending each sum into a u64 lane. This is slower on RVV/WASM.

* `V`: `{u,i}{8,16}` \
  V **SaturatedAdd**(V a, V b): returns `a[i] + b[i]` saturated to the
  minimum/maximum representable value.

* `V`: `{u,i}{8,16}` \
  V **SaturatedSub**(V a, V b): returns `a[i] - b[i]` saturated to the
  minimum/maximum representable value.

* `V`: `{u}{8,16}` \
  V **AverageRound**(V a, V b): returns `(a[i] + b[i] + 1) / 2`.

* V **Clamp**(V a, V lo, V hi): returns `a[i]` clamped to `[lo[i], hi[i]]`.

* `V`: `{f}` \
  V **operator/**(V a, V b): returns `a[i] / b[i]` in each lane. Currently
  unavailable on SVE/RVV; use the equivalent `Div` instead.

* `V`: `{f}` \
  V **Sqrt**(V a): returns `sqrt(a[i])`.

* `V`: `f32` \
  V **ApproximateReciprocalSqrt**(V a): returns an approximation of
  `1.0 / sqrt(a[i])`. `sqrt(a) ~= ApproximateReciprocalSqrt(a) * a`. x86 and
  PPC provide 12-bit approximations but the error on ARM is closer to 1%.

* `V`: `f32` \
  V **ApproximateReciprocal**(V a): returns an approximation of `1.0 / a[i]`.

#### Min/Max

**Note**: Min/Max corner cases are target-specific and may change. If either
argument is qNaN, x86 SIMD returns the second argument, ARMv7 Neon returns
NaN, and Wasm is supposed to return NaN but does not always; other targets
actually uphold IEEE 754-2019 minimumNumber: returning the other argument if
exactly one is qNaN, and NaN if both are.

* V **Min**(V a, V b): returns `min(a[i], b[i])`.

* V **Max**(V a, V b): returns `max(a[i], b[i])`.

All other ops in this section are only available if `HWY_TARGET !=
HWY_SCALAR`:

* `V`: `u64` \
  V **Min128**(D, V a, V b): returns the minimum of unsigned 128-bit values,
  each stored as an adjacent pair of 64-bit lanes (e.g. indices 1 and 0, where
  0 holds the least-significant 64 bits).

* `V`: `u64` \
  V **Max128**(D, V a, V b): returns the maximum of unsigned 128-bit values,
  each stored as an adjacent pair of 64-bit lanes (e.g. indices 1 and 0, where
  0 holds the least-significant 64 bits).
* `V`: `u64` \
  V **Min128Upper**(D, V a, V b): for each 128-bit key-value pair, returns `a`
  if it is considered less than `b` by Lt128Upper, else `b`.

* `V`: `u64` \
  V **Max128Upper**(D, V a, V b): for each 128-bit key-value pair, returns `a`
  if it is considered > `b` by Lt128Upper, else `b`.

#### Multiply

* `V`: `{u,i}{16,32,64}` \
  V **operator\***(V a, V b): returns the lower half of `a[i] * b[i]` in each
  lane. Currently unavailable on SVE/RVV; use the equivalent `Mul` instead.

* `V`: `{f}` \
  V **operator\***(V a, V b): returns `a[i] * b[i]` in each lane. Currently
  unavailable on SVE/RVV; use the equivalent `Mul` instead.

* `V`: `i16` \
  V **MulHigh**(V a, V b): returns the upper half of `a[i] * b[i]` in each
  lane.

* `V`: `i16` \
  V **MulFixedPoint15**(V a, V b): returns the result of multiplying two 1.15
  fixed-point numbers. This corresponds to doubling the multiplication result
  and storing the upper half. Results are implementation-defined iff both
  inputs are -32768.

* `V`: `{u,i}{32},u64` \
  V2 **MulEven**(V a, V b): returns the double-wide result of `a[i] * b[i]`
  for every even `i`, in lanes `i` (lower) and `i + 1` (upper). `V2` is a
  vector with double-width lanes, or the same as `V` for 64-bit inputs (which
  are only supported if `HWY_TARGET != HWY_SCALAR`).

* `V`: `u64` \
  V **MulOdd**(V a, V b): returns the double-wide result of `a[i] * b[i]` for
  every odd `i`, in lanes `i - 1` (lower) and `i` (upper). Only supported if
  `HWY_TARGET != HWY_SCALAR`.

* `V`: `{bf,i}16`, `D`: `RepartitionToWide<DFromV<V>>` \
  Vec<D> **ReorderWidenMulAccumulate**(D d, V a, V b, Vec<D> sum0, Vec<D>&
  sum1): widens `a` and `b` to `TFromD<D>`, then adds `a[i] * b[i]` to either
  `sum1[j]` or lane `j` of the return value, where `j = P(i)` and `P` is a
  permutation. The only guarantee is that `SumOfLanes(d, Add(return_value,
  sum1))` is the sum of all `a[i] * b[i]`. This is useful for computing dot
  products and the L2 norm.

#### Fused multiply-add

When implemented using special instructions, these functions are more precise
and faster than separate multiplication followed by addition. The `*Sub`
variants are somewhat slower on ARM; it is preferable to replace them with
`MulAdd` using a negated constant.

* `V`: `{f}` \
  V **MulAdd**(V a, V b, V c): returns `a[i] * b[i] + c[i]`.

* `V`: `{f}` \
  V **NegMulAdd**(V a, V b, V c): returns `-a[i] * b[i] + c[i]`.

* `V`: `{f}` \
  V **MulSub**(V a, V b, V c): returns `a[i] * b[i] - c[i]`.

* `V`: `{f}` \
  V **NegMulSub**(V a, V b, V c): returns `-a[i] * b[i] - c[i]`.

#### Shifts

**Note**: Counts not in `[0, sizeof(T)*8)` yield implementation-defined
results. Left-shifting signed `T` and right-shifting positive signed `T` is
the same as shifting `MakeUnsigned<T>` and casting to `T`. Right-shifting
negative signed `T` is the same as an unsigned shift, except that 1-bits are
shifted in.

Compile-time constant shifts: the amount must be in `[0, sizeof(T)*8)`.
Generally the most efficient variant, but 8-bit shifts are potentially slower
than other lane sizes, and `RotateRight` is often emulated with shifts:

* `V`: `{u,i}` \
  V **ShiftLeft**<int>(V a): returns `a[i] << int`.

* `V`: `{u,i}` \
  V **ShiftRight**<int>(V a): returns `a[i] >> int`.

* `V`: `{u}{32,64}` \
  V **RotateRight**<int>(V a): returns `(a[i] >> int) |
  (a[i] << (sizeof(T)*8 - int))`.
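Constant shifts compose well with the logical ops described below; here is a
sketch (`AverageRoundU32` is our name, since the built-in `AverageRound` only
covers u8/u16; `hn` is the alias from the earlier sketches):

```
// Sketch: rounded average (a[i] + b[i] + 1) / 2 for u32 lanes without
// overflow: with a = 2p + r and b = 2q + s, the result is p + q + (r | s).
template <class V>  // V: u32 vector
V AverageRoundU32(V a, V b) {
  const hn::DFromV<V> d;
  const V low_bits = hn::And(hn::Or(a, b), hn::Set(d, 1u));
  return hn::Add(hn::Add(hn::ShiftRight<1>(a), hn::ShiftRight<1>(b)),
                 low_bits);
}
```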
Shift all lanes by the same (not necessarily compile-time constant) amount:

* `V`: `{u,i}` \
  V **ShiftLeftSame**(V a, int bits): returns `a[i] << bits`.

* `V`: `{u,i}` \
  V **ShiftRightSame**(V a, int bits): returns `a[i] >> bits`.

Per-lane variable shifts (slow if SSSE3/SSE4, or 16-bit, or Shr i64 on AVX2):

* `V`: `{u,i}{16,32,64}` \
  V **operator<<**(V a, V b): returns `a[i] << b[i]`. Currently unavailable on
  SVE/RVV; use the equivalent `Shl` instead.

* `V`: `{u,i}{16,32,64}` \
  V **operator>>**(V a, V b): returns `a[i] >> b[i]`. Currently unavailable on
  SVE/RVV; use the equivalent `Shr` instead.

#### Floating-point rounding

* `V`: `{f}` \
  V **Round**(V v): returns `v[i]` rounded towards the nearest integer, with
  ties to even.

* `V`: `{f}` \
  V **Trunc**(V v): returns `v[i]` rounded towards zero (truncate).

* `V`: `{f}` \
  V **Ceil**(V v): returns `v[i]` rounded towards positive infinity (ceiling).

* `V`: `{f}` \
  V **Floor**(V v): returns `v[i]` rounded towards negative infinity.

#### Floating-point classification

* `V`: `{f}` \
  M **IsNaN**(V v): returns mask indicating whether `v[i]` is "not a number"
  (unordered).

* `V`: `{f}` \
  M **IsInf**(V v): returns mask indicating whether `v[i]` is positive or
  negative infinity.

* `V`: `{f}` \
  M **IsFinite**(V v): returns mask indicating whether `v[i]` is neither NaN
  nor infinity, i.e. normal, subnormal or zero. Equivalent to
  `Not(Or(IsNaN(v), IsInf(v)))`.

### Logical

* `V`: `{u,i}` \
  V **PopulationCount**(V a): returns the number of 1-bits in each lane, i.e.
  `PopCount(a[i])`.

The following operate on individual bits within each lane. Note that the
non-operator functions (`And` instead of `&`) must be used for floating-point
types, and on SVE/RVV.

* `V`: `{u,i}` \
  V **operator&**(V a, V b): returns `a[i] & b[i]`. Currently unavailable on
  SVE/RVV; use the equivalent `And` instead.

* `V`: `{u,i}` \
  V **operator|**(V a, V b): returns `a[i] | b[i]`. Currently unavailable on
  SVE/RVV; use the equivalent `Or` instead.

* `V`: `{u,i}` \
  V **operator^**(V a, V b): returns `a[i] ^ b[i]`. Currently unavailable on
  SVE/RVV; use the equivalent `Xor` instead.

* `V`: `{u,i}` \
  V **Not**(V v): returns `~v[i]`.

* V **AndNot**(V a, V b): returns `~a[i] & b[i]`.

The following three-argument functions may be more efficient than assembling
them from 2-argument functions:

* V **Or3**(V o1, V o2, V o3): returns `o1[i] | o2[i] | o3[i]`.
* V **OrAnd**(V o, V a1, V a2): returns `o[i] | (a1[i] & a2[i])`.

Special functions for signed types:

* `V`: `{f}` \
  V **CopySign**(V a, V b): returns the number with the magnitude of `a` and
  sign of `b`.

* `V`: `{f}` \
  V **CopySignToAbs**(V a, V b): as above, but potentially slightly more
  efficient; requires the first argument to be non-negative.

* `V`: `i32/64` \
  V **BroadcastSignBit**(V a): returns `a[i] < 0 ? -1 : 0`.

* `V`: `{f}` \
  V **ZeroIfNegative**(V v): returns `v[i] < 0 ? 0 : v[i]`.

* `V`: `{i,f}` \
  V **IfNegativeThenElse**(V v, V yes, V no): returns `v[i] < 0 ? yes[i] :
  no[i]`. This may be more efficient than `IfThenElse(Lt..)`.

### Masks

Let `M` denote a mask capable of storing a logical true/false for each lane
(the encoding depends on the platform).

#### Creation

* M **FirstN**(D, size_t N): returns mask with the first `N` lanes (those with
  index `< N`) true. `N >= Lanes(D())` results in an all-true mask. `N` must
  not exceed
  `LimitsMax<SignedFromSize<HWY_MIN(sizeof(size_t), sizeof(TFromD<D>))>>()`.
  Useful for implementing "masked" stores by loading `prev` followed by
  `IfThenElse(FirstN(d, N), what_to_store, prev)`; a sketch appears at the end
  of this section.

* M **MaskFromVec**(V v): returns false in lane `i` if `v[i] == 0`, or true if
  `v[i]` has all bits set. The result is *implementation-defined* if `v[i]` is
  neither zero nor all bits set.

* M **LoadMaskBits**(D, const uint8_t* p): returns a mask indicating whether
  the i-th bit in the array is set. Loads bytes and bits in ascending order of
  address and index. At least 8 bytes of `p` must be readable, but only
  `(Lanes(D()) + 7) / 8` need be initialized. Any unused bits (happens if
  `Lanes(D()) < 8`) are treated as if they were zero.

#### Conversion

* M1 **RebindMask**(D, M2 m): returns the same mask bits as `m`, but
  reinterpreted as a mask for lanes of type `TFromD<D>`. `M1` and `M2` must
  have the same number of lanes.

* V **VecFromMask**(D, M m): returns 0 in lane `i` if `m[i] == false`,
  otherwise all bits set.

* size_t **StoreMaskBits**(D, M m, uint8_t* p): stores a bit array indicating
  whether `m[i]` is true, in ascending order of `i`, filling the bits of each
  byte from least to most significant, then proceeding to the next byte.
  Returns the number of bytes written: `(Lanes(D()) + 7) / 8`. At least 8
  bytes of `p` must be writable.

#### Testing

* bool **AllTrue**(D, M m): returns whether all `m[i]` are true.

* bool **AllFalse**(D, M m): returns whether all `m[i]` are false.

* size_t **CountTrue**(D, M m): returns how many of `m[i]` are true, in
  `[0, N]`. This is typically more expensive than AllTrue/False.

* intptr_t **FindFirstTrue**(D, M m): returns the index of the first (i.e.
  lowest index) `m[i]` that is true, or -1 if none are.

* size_t **FindKnownFirstTrue**(D, M m): returns the index of the first (i.e.
  lowest index) `m[i]` that is true. Requires `!AllFalse(d, m)`, otherwise
  results are undefined. This is typically more efficient than
  `FindFirstTrue`.

#### Ternary operator

For `IfThen*`, masks must adhere to the invariant established by
`MaskFromVec`: false is zero, true has all bits set:

* V **IfThenElse**(M mask, V yes, V no): returns `mask[i] ? yes[i] : no[i]`.

* V **IfThenElseZero**(M mask, V yes): returns `mask[i] ? yes[i] : 0`.

* V **IfThenZeroElse**(M mask, V no): returns `mask[i] ? 0 : no[i]`.

* V **IfVecThenElse**(V mask, V yes, V no): equivalent to, and possibly faster
  than, `IfThenElse(MaskFromVec(mask), yes, no)`. The result is
  *implementation-defined* if `mask[i]` is neither zero nor all bits set.

#### Logical

* M **Not**(M m): returns mask of elements indicating whether the input mask
  element was false.

* M **And**(M a, M b): returns mask of elements indicating whether both input
  mask elements were true.

* M **AndNot**(M not_a, M b): returns mask of elements indicating whether
  `not_a` is false and `b` is true.

* M **Or**(M a, M b): returns mask of elements indicating whether either input
  mask element was true.

* M **Xor**(M a, M b): returns mask of elements indicating whether exactly one
  input mask element was true.

* M **ExclusiveNeither**(M a, M b): returns mask of elements indicating `a` is
  false and `b` is false. Undefined if both are true. We choose not to provide
  NotOr/NotXor because x86 and SVE only define one of these operations. This
  op is for situations where the inputs are known to be mutually exclusive.
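Here is the `FirstN` recipe from above as a minimal sketch (the function name
is ours, and we assume `Lanes(d)` elements at `p` are readable and writable):

```
// Sketch: double the first `remaining` (< Lanes(d)) elements in place,
// without a scalar tail loop.
void DoubleTail(float* HWY_RESTRICT p, size_t remaining) {
  const hn::ScalableTag<float> d;
  const auto m = hn::FirstN(d, remaining);
  const auto prev = hn::LoadU(d, p);
  hn::StoreU(hn::IfThenElse(m, hn::Add(prev, prev), prev), d, p);
}
```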
#### Compress

* `V`: `{u,i,f}{16,32,64}` \
  V **Compress**(V v, M m): returns `r` such that `r[n]` is `v[i]`, with `i`
  the n-th lane index (starting from 0) where `m[i]` is true. Compacts lanes
  whose mask is true into the lower lanes. For targets and lane type `T` where
  `CompressIsPartition<T>::value` is true, the upper lanes are those whose
  mask is false (thus `Compress` corresponds to partitioning according to the
  mask). Otherwise, the upper lanes are implementation-defined. Slow with
  16-bit lanes. Use this form when the input is already a mask, e.g. returned
  by a comparison.

* `V`: `{u,i,f}{16,32,64}` \
  V **CompressNot**(V v, M m): equivalent to `Compress(v, Not(m))` but
  possibly faster if `CompressIsPartition<T>::value` is true.

* `V`: `u64` \
  V **CompressBlocksNot**(V v, M m): equivalent to `CompressNot(v, m)` when
  `m` is structured as adjacent pairs (both true or false), e.g. as returned
  by `Lt128`. This is a no-op for 128-bit vectors. Unavailable if
  `HWY_TARGET == HWY_SCALAR`.

* `V`: `{u,i,f}{16,32,64}` \
  size_t **CompressStore**(V v, M m, D d, T* p): writes lanes whose mask `m`
  is true into `p`, starting from lane 0. Returns `CountTrue(d, m)`, the
  number of valid lanes. May be implemented as `Compress` followed by
  `StoreU`; lanes after the valid ones may still be overwritten! Slower for
  16-bit lanes.

* `V`: `{u,i,f}{16,32,64}` \
  size_t **CompressBlendedStore**(V v, M m, D d, T* p): writes only lanes
  whose mask `m` is true into `p`, starting from lane 0. Returns
  `CountTrue(d, m)`, the number of lanes written. Does not modify subsequent
  lanes, but there is no guarantee of atomicity because this may be
  implemented as `Compress, LoadU, IfThenElse(FirstN), StoreU`.

* `V`: `{u,i,f}{16,32,64}` \
  V **CompressBits**(V v, const uint8_t* HWY_RESTRICT bits): equivalent to,
  but often faster than `Compress(v, LoadMaskBits(d, bits))`. `bits` is as
  specified for `LoadMaskBits`. If called multiple times, the `bits` pointer
  passed to this function must also be marked `HWY_RESTRICT` to avoid repeated
  work. Note that if the vector has less than 8 elements, incrementing `bits`
  will not work as intended for packed bit arrays. As with `Compress`,
  `CompressIsPartition` indicates the mask=false lanes are moved to the upper
  lanes; this op is also slow for 16-bit lanes.

* `V`: `{u,i,f}{16,32,64}` \
  size_t **CompressBitsStore**(V v, const uint8_t* HWY_RESTRICT bits, D d, T*
  p): combination of `CompressStore` and `CompressBits`; see remarks there.

### Comparisons

These return a mask (see above) indicating whether the condition is true. A
sketch combining a comparison with `CompressBlendedStore` appears after the
128-bit comparisons below.

* M **operator==**(V a, V b): returns `a[i] == b[i]`. Currently unavailable on
  SVE/RVV; use the equivalent `Eq` instead.
* M **operator!=**(V a, V b): returns `a[i] != b[i]`. Currently unavailable on
  SVE/RVV; use the equivalent `Ne` instead.

* M **operator<**(V a, V b): returns `a[i] < b[i]`. Currently unavailable on
  SVE/RVV; use the equivalent `Lt` instead.

* M **operator>**(V a, V b): returns `a[i] > b[i]`. Currently unavailable on
  SVE/RVV; use the equivalent `Gt` instead.

* `V`: `{f}` \
  M **operator<=**(V a, V b): returns `a[i] <= b[i]`. Currently unavailable on
  SVE/RVV; use the equivalent `Le` instead.

* `V`: `{f}` \
  M **operator>=**(V a, V b): returns `a[i] >= b[i]`. Currently unavailable on
  SVE/RVV; use the equivalent `Ge` instead.

* `V`: `{u,i}` \
  M **TestBit**(V v, V bit): returns `(v[i] & bit[i]) == bit[i]`. `bit[i]`
  must have exactly one bit set.
* `V`: `u64` \
  M **Lt128**(D, V a, V b): for each adjacent pair of 64-bit lanes (e.g.
  indices 1,0), returns whether `a[1]:a[0]` concatenated to an unsigned
  128-bit integer (least significant bits in `a[0]`) is less than `b[1]:b[0]`.
  For each pair, the mask lanes are either both true or both false.
  Unavailable if `HWY_TARGET == HWY_SCALAR`.

* `V`: `u64` \
  M **Lt128Upper**(D, V a, V b): for each adjacent pair of 64-bit lanes (e.g.
  indices 1,0), returns whether `a[1]` is less than `b[1]`. For each pair, the
  mask lanes are either both true or both false. This is useful for comparing
  64-bit keys alongside 64-bit values. Only available if `HWY_TARGET !=
  HWY_SCALAR`.

* `V`: `u64` \
  M **Eq128**(D, V a, V b): for each adjacent pair of 64-bit lanes (e.g.
  indices 1,0), returns whether `a[1]:a[0]` concatenated to an unsigned
  128-bit integer (least significant bits in `a[0]`) equals `b[1]:b[0]`. For
  each pair, the mask lanes are either both true or both false. Unavailable if
  `HWY_TARGET == HWY_SCALAR`.

* `V`: `u64` \
  M **Ne128**(D, V a, V b): for each adjacent pair of 64-bit lanes (e.g.
  indices 1,0), returns whether `a[1]:a[0]` concatenated to an unsigned
  128-bit integer (least significant bits in `a[0]`) differs from `b[1]:b[0]`.
  For each pair, the mask lanes are either both true or both false.
  Unavailable if `HWY_TARGET == HWY_SCALAR`.

* `V`: `u64` \
  M **Eq128Upper**(D, V a, V b): for each adjacent pair of 64-bit lanes (e.g.
  indices 1,0), returns whether `a[1]` equals `b[1]`. For each pair, the mask
  lanes are either both true or both false. This is useful for comparing
  64-bit keys alongside 64-bit values. Only available if `HWY_TARGET !=
  HWY_SCALAR`.

* `V`: `u64` \
  M **Ne128Upper**(D, V a, V b): for each adjacent pair of 64-bit lanes (e.g.
  indices 1,0), returns whether `a[1]` differs from `b[1]`. For each pair, the
  mask lanes are either both true or both false. This is useful for comparing
  64-bit keys alongside 64-bit values. Only available if `HWY_TARGET !=
  HWY_SCALAR`.
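The comparison-plus-compaction sketch referenced above (the function name is
ours; the scalar remainder is omitted for brevity):

```
// Sketch: append the negative elements of `in` to `out`; returns the count.
size_t CopyNegatives(const float* HWY_RESTRICT in, size_t size,
                     float* HWY_RESTRICT out) {
  const hn::ScalableTag<float> d;
  const size_t N = hn::Lanes(d);
  size_t written = 0;
  for (size_t i = 0; i + N <= size; i += N) {
    const auto v = hn::LoadU(d, in + i);
    const auto m = hn::Lt(v, hn::Zero(d));  // mask from a comparison
    written += hn::CompressBlendedStore(v, m, d, out + written);
  }
  return written;  // remainder elements not handled in this sketch
}
```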
### Memory

Memory operands are little-endian, otherwise their order would depend on the
lane configuration. Pointers are the addresses of `N` consecutive `T` values,
either `aligned` (address is a multiple of the vector size) or possibly
unaligned (denoted `p`).

Even unaligned addresses must still be a multiple of `sizeof(T)`, otherwise
`StoreU` may crash on some platforms (e.g. RVV and ARMv7). Note that C++
ensures automatic (stack) and dynamically allocated (via `new` or `malloc`)
variables of type `T` are aligned to `sizeof(T)`, hence such addresses are
suitable for `StoreU`. However, casting pointers to `char*` and adding
arbitrary offsets (not a multiple of `sizeof(T)`) can violate this
requirement.

**Note**: computations with low arithmetic intensity (FLOPs per byte of memory
traffic), e.g. dot product, can be *1.5 times as fast* when the memory
operands are aligned to the vector size. An unaligned access may require two
load ports.

#### Load

* Vec<D> **Load**(D, const T* aligned): returns `aligned[i]`. May fault if the
  pointer is not aligned to the vector size (using aligned_allocator.h is
  safe). Using this whenever possible improves codegen on SSSE3/SSE4: unlike
  `LoadU`, `Load` can be fused into a memory operand, which reduces register
  pressure.

Requires only *element-aligned* vectors (e.g. from malloc/std::vector, or
aligned memory at indices which are not a multiple of the vector length):

* Vec<D> **LoadU**(D, const T* p): returns `p[i]`.

* Vec<D> **LoadDup128**(D, const T* p): returns one 128-bit block loaded from
  `p` and broadcasted into all 128-bit block\[s\]. This may be faster than
  broadcasting single values, and is more convenient than preparing constants
  for the actual vector length. Only available if `HWY_TARGET != HWY_SCALAR`.

* Vec<D> **MaskedLoad**(M mask, D, const T* p): returns `p[i]` or zero if the
  `mask` governing element `i` is false. May fault even where `mask` is false
  `#if HWY_MEM_OPS_MIGHT_FAULT`. If `p` is aligned, faults cannot happen
  unless the entire vector is inaccessible. Equivalent to, and potentially
  more efficient than, `IfThenElseZero(mask, Load(D(), aligned))`.

* void **LoadInterleaved2**(D, const T* p, Vec<D>& v0, Vec<D>& v1): equivalent
  to `LoadU` into `v0, v1` followed by shuffling, such that `v0[0] == p[0],
  v1[0] == p[1]`.

* void **LoadInterleaved3**(D, const T* p, Vec<D>& v0, Vec<D>& v1, Vec<D>&
  v2): as above, but for three vectors (e.g. RGB samples).

* void **LoadInterleaved4**(D, const T* p, Vec<D>& v0, Vec<D>& v1, Vec<D>& v2,
  Vec<D>& v3): as above, but for four vectors (e.g. RGBA).

#### Scatter/Gather

**Note**: Offsets/indices are of type `VI = Vec<RebindToSigned<D>>` and need
not be unique. The results are implementation-defined if any are negative.

**Note**: Where possible, applications should `Load/Store/TableLookup*` entire
vectors, which is much faster than `Scatter/Gather`. Otherwise, code of the
form `dst[tbl[i]] = F(src[i])` should, when possible, be transformed to
`dst[i] = F(src[tbl[i]])` because `Scatter` is more expensive than `Gather`.

* `D`: `{u,i,f}{32,64}` \
  void **ScatterOffset**(Vec<D> v, D, T* base, VI offsets): stores `v[i]` to
  the base address plus *byte* `offsets[i]`.

* `D`: `{u,i,f}{32,64}` \
  void **ScatterIndex**(Vec<D> v, D, T* base, VI indices): stores `v[i]` to
  `base[indices[i]]`.

* `D`: `{u,i,f}{32,64}` \
  Vec<D> **GatherOffset**(D, const T* base, VI offsets): returns elements of
  base selected by *byte* `offsets[i]`.

* `D`: `{u,i,f}{32,64}` \
  Vec<D> **GatherIndex**(D, const T* base, VI indices): returns vector of
  `base[indices[i]]`.

#### Store

* void **Store**(Vec<D> v, D, T* aligned): copies `v[i]` into `aligned[i]`,
  which must be aligned to the vector size. Writes exactly `N * sizeof(T)`
  bytes.

* void **StoreU**(Vec<D> v, D, T* p): as `Store`, but the alignment
  requirement is relaxed to element-aligned (multiple of `sizeof(T)`).

* void **BlendedStore**(Vec<D> v, M m, D d, T* p): as `StoreU`, but only
  updates `p` where `m` is true. May fault even where `mask` is false `#if
  HWY_MEM_OPS_MIGHT_FAULT`. If `p` is aligned, faults cannot happen unless the
  entire vector is inaccessible. Equivalent to, and potentially more efficient
  than, `StoreU(IfThenElse(m, v, LoadU(d, p)), d, p)`. "Blended" indicates
  this may not be atomic; other threads must not concurrently update
  `[p, p + Lanes(d))` without synchronization.

* void **SafeFillN**(size_t num, T value, D d, T* HWY_RESTRICT to): sets
  `to[0, num)` to `value`. If `num` exceeds `Lanes(d)`, the behavior is
  target-dependent (either filling all, or no more than one vector).
  Potentially more efficient than a scalar loop, but will not fault, unlike
  `BlendedStore`. No alignment requirement. Potentially non-atomic, like
  `BlendedStore`.

* void **SafeCopyN**(size_t num, D d, const T* HWY_RESTRICT from, T*
  HWY_RESTRICT to): copies `from[0, num)` to `to`. If `num` exceeds
  `Lanes(d)`, the behavior is target-dependent (either copying all, or no more
  than one vector). Potentially more efficient than a scalar loop, but will
  not fault, unlike `BlendedStore`. No alignment requirement. Potentially
  non-atomic, like `BlendedStore`.

* void **StoreInterleaved2**(Vec<D> v0, Vec<D> v1, D, T* p): equivalent to
  shuffling `v0, v1` followed by two `StoreU()`, such that `p[0] == v0[0],
  p[1] == v1[0]`.

* void **StoreInterleaved3**(Vec<D> v0, Vec<D> v1, Vec<D> v2, D, T* p): as
  above, but for three vectors (e.g. RGB samples).

* void **StoreInterleaved4**(Vec<D> v0, Vec<D> v1, Vec<D> v2, Vec<D> v3, D, T*
  p): as above, but for four vectors (e.g. RGBA samples).
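A sketch of the interleaved load/store pair for one vector's worth of RGB
samples (the function name is ours; callers would loop over the image):

```
// Sketch: de-interleave RGB bytes, brighten green, re-interleave in place.
// Processes Lanes(d) pixels per call.
void BrightenGreen(uint8_t* HWY_RESTRICT rgb) {
  const hn::ScalableTag<uint8_t> d;
  hn::Vec<decltype(d)> r, g, b;
  hn::LoadInterleaved3(d, rgb, r, g, b);
  g = hn::SaturatedAdd(g, hn::Set(d, 16));  // clamps at 255
  hn::StoreInterleaved3(r, g, b, d, rgb);
}
```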
### Cache control

All functions except `Stream` are defined in cache_control.h.

* void **Stream**(Vec<D> a, D d, T* aligned): copies `a[i]` into `aligned[i]`
  with a non-temporal hint if available (useful for write-only data; avoids
  cache pollution). May be implemented using a CPU-internal buffer. To avoid
  partial flushes and unpredictable interactions with atomics (for example,
  see Intel SDM Vol 4, Sec. 8.1.2.2), call this consecutively for an entire
  cache line (typically 64 bytes, aligned to its size). Each call may write a
  multiple of `HWY_STREAM_MULTIPLE` bytes, which can exceed `Lanes(d) *
  sizeof(T)`. The new contents of `aligned` may not be visible until
  `FlushStream` is called.

* void **FlushStream**(): ensures values written by previous `Stream` calls
  are visible on the current core. This is NOT sufficient for synchronizing
  across cores; when `Stream` outputs are to be consumed by other core(s), the
  producer must publish availability (e.g. via mutex or atomic_flag) after
  `FlushStream`.

* void **FlushCacheline**(const void* p): invalidates and flushes the cache
  line containing `p`, if possible.

* void **Prefetch**(const T* p): optionally begins loading the cache line
  containing `p` to reduce latency of subsequent actual loads.

* void **Pause**(): when called inside a spin-loop, may reduce power
  consumption.

### Type conversion

* Vec<D> **BitCast**(D, V): returns the bits of `V` reinterpreted as type
  `Vec<D>`.

* `V`,`D`: (`u8,u16`), (`u16,u32`), (`u8,u32`), (`u32,u64`), (`u8,i16`),
  (`u8,i32`), (`u16,i32`), (`i8,i16`), (`i8,i32`), (`i16,i32`), (`i32,i64`) \
  Vec<D> **PromoteTo**(D, V part): returns `part[i]` zero- or sign-extended to
  the integer type `MakeWide<T>`.

* `V`,`D`: (`f16,f32`), (`bf16,f32`), (`f32,f64`) \
  Vec<D> **PromoteTo**(D, V part): returns `part[i]` widened to the
  floating-point type `MakeWide<T>`.

* `V`,`D`: (`i32,f64`) \
  Vec<D> **PromoteTo**(D, V part): returns `part[i]` converted to 64-bit
  floating point.

* `V`,`D`: (`bf16,f32`) \
  Vec<D> **PromoteLowerTo**(D, V v): returns `v[i]` widened to `MakeWide<T>`,
  for i in `[0, Lanes(D()))`. Note that `V` has twice as many lanes as `D` and
  the return value.

* `V`,`D`: (`bf16,f32`) \
  Vec<D> **PromoteUpperTo**(D, V v): returns `v[i]` widened to `MakeWide<T>`,
  for i in `[Lanes(D()), 2 * Lanes(D()))`. Note that `V` has twice as many
  lanes as `D` and the return value.

* `V`,`V8`: (`u32,u8`) \
  V8 **U8FromU32**(V): special-case `u32` to `u8` conversion when all lanes of
  `V` are already clamped to `[0, 256)`.
* `D`,`V`: (`u64,u32`), (`u64,u16`), (`u64,u8`), (`u32,u16`), (`u32,u8`),
  (`u16,u8`) \
  Vec<D> **TruncateTo**(D, V v): returns `v[i]` truncated to the smaller type
  indicated by `T = TFromD<D>`, with the same result as if the
  more-significant input bits that do not fit in `T` had been zero. Example:

  ```
  ScalableTag<uint32_t> du32;
  Rebind<uint8_t, decltype(du32)> du8;
  TruncateTo(du8, Set(du32, 0xF08F))
  ```

  is the same as `Set(du8, 0x8F)`.

`DemoteTo` and float-to-int `ConvertTo` return the closest representable value
if the input exceeds the destination range.

* `V`,`D`: (`i16,i8`), (`i32,i8`), (`i32,i16`), (`i16,u8`), (`i32,u8`),
  (`i32,u16`), (`f64,f32`) \
  Vec<D> **DemoteTo**(D, V a): returns `a[i]` after packing with
  signed/unsigned saturation to `MakeNarrow<T>`.

* `V`,`D`: `f64,i32` \
  Vec<D> **DemoteTo**(D, V a): rounds floating point towards zero and converts
  the value to 32-bit integers.

* `V`,`D`: (`f32,f16`), (`f32,bf16`) \
  Vec<D> **DemoteTo**(D, V a): narrows float to half (for bf16, it is
  unspecified whether this truncates or rounds).

* `D`: `{bf,i}16`, `V`: `RepartitionToWide<D>` \
  Vec<D> **ReorderDemote2To**(D, V a, V b): as above, but converts two inputs;
  `D` and the output have twice as many lanes as `V`, and the output order is
  some permutation of the inputs. Only available if `HWY_TARGET !=
  HWY_SCALAR`.

* `V`,`D`: (`i32`,`f32`), (`i64`,`f64`) \
  Vec<D> **ConvertTo**(D, V): converts an integer value to same-sized floating
  point.

* `V`,`D`: (`f32`,`i32`), (`f64`,`i64`) \
  Vec<D> **ConvertTo**(D, V): rounds floating point towards zero and converts
  the value to a same-sized integer.

* `V`: `f32`; `Ret`: `i32` \
  Ret **NearestInt**(V a): returns the integer nearest to `a[i]`; results are
  undefined for NaN.

### Combine

* V2 **LowerHalf**([D, ] V): returns the lower half of the vector `V`. The
  optional `D` (provided for consistency with `UpperHalf`) is
  `Half<DFromV<V>>`.

All other ops in this section are only available if `HWY_TARGET !=
HWY_SCALAR`:

* V2 **UpperHalf**(D, V): returns the upper half of the vector `V`, where `D`
  is `Half<DFromV<V>>`.

* V **ZeroExtendVector**(D, V2): returns vector whose `UpperHalf` is zero and
  whose `LowerHalf` is the argument; `D` is `Twice<DFromV<V2>>`.

* V **Combine**(D, V2, V2): returns vector whose `UpperHalf` is the first
  argument and whose `LowerHalf` is the second argument; `D` is
  `Twice<DFromV<V2>>`.

**Note**: the following operations cross block boundaries, which is typically
more expensive on AVX2/AVX-512 than per-block operations.

* V **ConcatLowerLower**(D, V hi, V lo): returns the concatenation of the
  lower halves of `hi` and `lo` without splitting into blocks. `D` is
  `DFromV<V>`.

* V **ConcatUpperUpper**(D, V hi, V lo): returns the concatenation of the
  upper halves of `hi` and `lo` without splitting into blocks. `D` is
  `DFromV<V>`.

* V **ConcatLowerUpper**(D, V hi, V lo): returns the inner half of the
  concatenation of `hi` and `lo` without splitting into blocks. Useful for
  swapping the two blocks in 256-bit vectors. `D` is `DFromV<V>`.

* V **ConcatUpperLower**(D, V hi, V lo): returns the outer quarters of the
  concatenation of `hi` and `lo` without splitting into blocks. Unlike the
  other variants, this does not incur a block-crossing penalty on AVX2/3. `D`
  is `DFromV<V>`.

* V **ConcatOdd**(D, V hi, V lo): returns the concatenation of the odd lanes
  of `hi` and the odd lanes of `lo`.

* V **ConcatEven**(D, V hi, V lo): returns the concatenation of the even lanes
  of `hi` and the even lanes of `lo`.
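Tying the conversion ops to the earlier `Rebind` note, here is a
mixed-precision sketch (the function name is ours; it processes one vector's
worth of elements):

```
// Sketch: widen u8 to i16, double, then narrow back with saturation.
void DoublePixels(const uint8_t* HWY_RESTRICT in, uint8_t* HWY_RESTRICT out) {
  const hn::ScalableTag<int16_t> d16;
  const hn::Rebind<uint8_t, decltype(d16)> d8;  // same lane count as d16
  const auto wide = hn::PromoteTo(d16, hn::LoadU(d8, in));     // u8 -> i16
  hn::StoreU(hn::DemoteTo(d8, hn::Add(wide, wide)), d8, out);  // saturates
}
```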
### Blockwise

**Note**: if vectors are larger than 128 bits, the following operations split
their operands into independently processed 128-bit *blocks*.

* `V`: `{u,i}{16,32,64}, {f}` \
  V **Broadcast**<int i>(V): returns individual *blocks*, each with lanes set
  to `input_block[i]`, `i = [0, 16/sizeof(T))`.

All other ops in this section are only available if `HWY_TARGET !=
HWY_SCALAR`:

* `V`: `{u,i}` \
  VI **TableLookupBytes**(V bytes, VI indices): returns `bytes[indices[i]]`.
  Uses byte lanes regardless of the actual vector types. Results are
  implementation-defined if `indices[i] < 0` or `indices[i] >=
  HWY_MIN(Lanes(DFromV<V>()), 16)`. `VI` are integers, possibly of a different
  type than those in `V`. The number of lanes in `V` and `VI` may differ, e.g.
  a full-length table vector loaded via `LoadDup128`, plus a partial vector
  `VI` of 4-bit indices; a LUT sketch appears below, before the "Shuffle"
  subsection.

* `V`: `{u,i}` \
  VI **TableLookupBytesOr0**(V bytes, VI indices): returns
  `bytes[indices[i]]`, or 0 if `indices[i] & 0x80`. Uses byte lanes regardless
  of the actual vector types. Results are implementation-defined for
  `indices[i] < 0` or in `[HWY_MIN(Lanes(DFromV<V>()), 16), 0x80)`. The
  zeroing behavior has zero cost on x86 and ARM. For vectors of >= 256 bytes
  (can happen on SVE and RVV), this will set all lanes after the first 128
  to 0. `VI` are integers, possibly of a different type than those in `V`. The
  number of lanes in `V` and `VI` may differ.

#### Interleave

Ops in this section are only available if `HWY_TARGET != HWY_SCALAR`:

* V **InterleaveLower**([D, ] V a, V b): returns *blocks* with alternating
  lanes from the lower halves of `a` and `b` (`a[0]` in the least-significant
  lane). The optional `D` (provided for consistency with `InterleaveUpper`) is
  `DFromV<V>`.

* V **InterleaveUpper**(D, V a, V b): returns *blocks* with alternating lanes
  from the upper halves of `a` and `b` (`a[N/2]` in the least-significant
  lane). `D` is `DFromV<V>`.

#### Zip

* `Ret`: `MakeWide<V>`; `V`: `{u,i}{8,16,32}` \
  Ret **ZipLower**([DW, ] V a, V b): returns the same bits as
  `InterleaveLower`, but repartitioned into double-width lanes (required in
  order to use this operation with scalars). The optional `DW` (provided for
  consistency with `ZipUpper`) is `RepartitionToWide<DFromV<V>>`.

* `Ret`: `MakeWide<V>`; `V`: `{u,i}{8,16,32}` \
  Ret **ZipUpper**(DW, V a, V b): returns the same bits as `InterleaveUpper`,
  but repartitioned into double-width lanes (required in order to use this
  operation with scalars). `DW` is `RepartitionToWide<DFromV<V>>`. Only
  available if `HWY_TARGET != HWY_SCALAR`.

#### Shift

Ops in this section are only available if `HWY_TARGET != HWY_SCALAR`:

* `V`: `{u,i}` \
  V **ShiftLeftBytes**<int>([D, ] V): returns the result of shifting
  independent *blocks* left by `int` bytes \[1, 15\]. The optional `D`
  (provided for consistency with `ShiftRightBytes`) is `DFromV<V>`.

* V **ShiftLeftLanes**<int>([D, ] V): returns the result of shifting
  independent *blocks* left by `int` lanes. The optional `D` (provided for
  consistency with `ShiftRightLanes`) is `DFromV<V>`.

* `V`: `{u,i}` \
  V **ShiftRightBytes**<int>(D, V): returns the result of shifting independent
  *blocks* right by `int` bytes \[1, 15\], shifting in zeros even for partial
  vectors. `D` is `DFromV<V>`.

* V **ShiftRightLanes**<int>(D, V): returns the result of shifting independent
  *blocks* right by `int` lanes, shifting in zeros even for partial vectors.
  `D` is `DFromV<V>`.

* `V`: `{u,i}` \
  V **CombineShiftRightBytes**<int>(D, V hi, V lo): returns a vector of
  *blocks*, each the result of shifting two concatenated *blocks* `hi[i] ||
  lo[i]` right by `int` bytes \[1, 16). `D` is `DFromV<V>`.

* V **CombineShiftRightLanes**<int>(D, V hi, V lo): returns a vector of
  *blocks*, each the result of shifting two concatenated *blocks* `hi[i] ||
  lo[i]` right by `int` lanes \[1, 16/sizeof(T)). `D` is `DFromV<V>`.
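Returning to `TableLookupBytes` above: a common pattern is a 16-entry per-byte
LUT, broadcast to all blocks via `LoadDup128`. A sketch (the function name is
ours):

```
// Sketch: popcount of the low nibble of each byte via a 4-bit LUT.
template <class V>  // V: u8 vector
V PopcountLowNibble(V bytes) {
  alignas(16) static constexpr uint8_t kPopCnt4[16] = {
      0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4};
  const hn::DFromV<V> d;
  const V table = hn::LoadDup128(d, kPopCnt4);    // same LUT in every block
  const V lo = hn::And(bytes, hn::Set(d, 0x0F));  // indices must be < 16
  return hn::TableLookupBytes(table, lo);
}
```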
+
+* `V`: `{u,i}` \
+  V **CombineShiftRightBytes**<int>(D, V hi, V lo): returns
+  a vector of *blocks* each the result of shifting two concatenated *blocks*
+  `hi[i] || lo[i]` right by `int` bytes \[1, 16). `D` is `DFromV<V>`.
+
+* V **CombineShiftRightLanes**<int>(D, V hi, V lo): returns
+  a vector of *blocks* each the result of shifting two concatenated *blocks*
+  `hi[i] || lo[i]` right by `int` lanes \[1, 16/sizeof(T)). `D` is
+  `DFromV<V>`.
+
+#### Shuffle
+
+Ops in this section are only available if `HWY_TARGET != HWY_SCALAR`:
+
+* `V`: `{u,i,f}{32}` \
+  V **Shuffle1032**(V): returns *blocks* with 64-bit halves
+  swapped.
+
+* `V`: `{u,i,f}{32}` \
+  V **Shuffle0321**(V): returns *blocks* rotated right (toward
+  the lower end) by 32 bits.
+
+* `V`: `{u,i,f}{32}` \
+  V **Shuffle2103**(V): returns *blocks* rotated left (toward the
+  upper end) by 32 bits.
+
+The following are equivalent to `Reverse2` or `Reverse4`, which should be used
+instead because they are more general:
+
+* `V`: `{u,i,f}{32}` \
+  V **Shuffle2301**(V): returns *blocks* with 32-bit halves
+  swapped inside 64-bit halves.
+
+* `V`: `{u,i,f}{64}` \
+  V **Shuffle01**(V): returns *blocks* with 64-bit halves
+  swapped.
+
+* `V`: `{u,i,f}{32}` \
+  V **Shuffle0123**(V): returns *blocks* with lanes in reverse
+  order.
+
+### Swizzle
+
+* V **OddEven**(V a, V b): returns a vector whose odd lanes are
+  taken from `a` and the even lanes from `b`.
+
+* V **OddEvenBlocks**(V a, V b): returns a vector whose odd
+  blocks are taken from `a` and the even blocks from `b`. Returns `b` if the
+  vector has no more than one block (i.e. is 128 bits or scalar).
+
+* `V`: `{u,i,f}{32,64}` \
+  V **DupEven**(V v): returns `r`, the result of copying even
+  lanes to the next higher-indexed lane. For each even lane index `i`,
+  `r[i] == v[i]` and `r[i + 1] == v[i]`.
+
+* V **ReverseBlocks**(V v): returns a vector with blocks in
+  reversed order.
+
+* `V`: `{u,i,f}{32,64}` \
+  V **TableLookupLanes**(V a, unspecified) returns a vector
+  of `a[indices[i]]`, where `unspecified` is the return value of
+  `SetTableIndices(D, &indices[0])` or `IndicesFromVec`. The indices are
+  not limited to blocks, hence this is slower than `TableLookupBytes*` on
+  AVX2/AVX-512. Results are implementation-defined unless `0 <= indices[i]
+  < Lanes(D())`. `indices` are always integers, even if `V` is a
+  floating-point type.
+
+* `D`: `{u,i}{32,64}` \
+  unspecified **IndicesFromVec**(D d, V idx) prepares for
+  `TableLookupLanes` with integer indices in `idx`, which must be the same bit
+  width as `TFromD<D>` and in the range `[0, Lanes(d))`, but need not be
+  unique.
+
+* `D`: `{u,i}{32,64}` \
+  unspecified **SetTableIndices**(D d, TI* idx) prepares for
+  `TableLookupLanes` by loading `Lanes(d)` integer indices from `idx`, which
+  must be in the range `[0, Lanes(d))` but need not be unique. The index type
+  `TI` must be an integer of the same size as `TFromD<D>`.
+
+* `V`: `{u,i,f}{16,32,64}` \
+  V **Reverse**(D, V a) returns a vector with lanes in reversed
+  order (`out[i] == a[Lanes(D()) - 1 - i]`).
+
+The following `ReverseN` must not be called if `Lanes(D()) < N`:
+
+* `V`: `{u,i,f}{16,32,64}` \
+  V **Reverse2**(D, V a) returns a vector with each group of 2
+  contiguous lanes in reversed order (`out[i] == a[i ^ 1]`).
+
+* `V`: `{u,i,f}{16,32,64}` \
+  V **Reverse4**(D, V a) returns a vector with each group of 4
+  contiguous lanes in reversed order (`out[i] == a[i ^ 3]`).
+
+* `V`: `{u,i,f}{16,32,64}` \
+  V **Reverse8**(D, V a) returns a vector with each group of 8
+  contiguous lanes in reversed order (`out[i] == a[i ^ 7]`).
+
+All other ops in this section are only available if `HWY_TARGET != HWY_SCALAR`:
+
+* `V`: `{u,i,f}{32,64}` \
+  V **DupOdd**(V v): returns `r`, the result of copying odd lanes
+  to the previous lower-indexed lane. For each odd lane index `i`, `r[i] ==
+  v[i]` and `r[i - 1] == v[i]`.
+
+* V **SwapAdjacentBlocks**(V v): returns a vector where blocks of
+  index `2*i` and `2*i+1` are swapped. Results are undefined for vectors with
+  fewer than two blocks; callers must first check that via `Lanes`.
+
+### Reductions
+
+**Note**: these 'reduce' all lanes to a single result (e.g. sum), which is
+broadcast to all lanes. To obtain a scalar, you can call `GetLane`.
+
+Being horizontal operations (across lanes of the same vector), these are slower
+than normal SIMD operations and are typically used outside critical loops.
+
+* `V`: `{u,i,f}{32,64},{u,i}{16}` \
+  V **SumOfLanes**(D, V v): returns the sum of all lanes in each
+  lane.
+
+* `V`: `{u,i,f}{32,64},{u,i}{16}` \
+  V **MinOfLanes**(D, V v): returns the minimum-valued lane in
+  each lane.
+
+* `V`: `{u,i,f}{32,64},{u,i}{16}` \
+  V **MaxOfLanes**(D, V v): returns the maximum-valued lane in
+  each lane.
+
+### Crypto
+
+Ops in this section are only available if `HWY_TARGET != HWY_SCALAR`:
+
+* `V`: `u8` \
+  V **AESRound**(V state, V round_key): one round of AES
+  encryption: `MixColumns(SubBytes(ShiftRows(state))) ^ round_key`. This
+  matches x86 AES-NI. The latency is independent of the input values.
+
+* `V`: `u8` \
+  V **AESLastRound**(V state, V round_key): the last round of AES
+  encryption: `SubBytes(ShiftRows(state)) ^ round_key`. This matches x86
+  AES-NI. The latency is independent of the input values.
+
+* `V`: `u64` \
+  V **CLMulLower**(V a, V b): carryless multiplication of the
+  lower 64 bits of each 128-bit block into a 128-bit product. The latency is
+  independent of the input values (assuming that is true of normal integer
+  multiplication) so this can safely be used in crypto. Applications that wish
+  to multiply upper with lower halves can `Shuffle01` one of the operands; on
+  x86 that is expected to be latency-neutral.
+
+* `V`: `u64` \
+  V **CLMulUpper**(V a, V b): as `CLMulLower`, but multiplies the
+  upper 64 bits of each 128-bit block.
+
+## Preprocessor macros
+
+* `HWY_ALIGN`: Prefix for stack-allocated (i.e. automatic storage duration)
+  arrays to ensure they have suitable alignment for Load()/Store(). This is
+  specific to `HWY_TARGET` and should only be used inside `HWY_NAMESPACE`.
+
+  Arrays should also only be used for partial (<= 128-bit) vectors, or
+  `LoadDup128`, because full vectors may be too large for the stack and should
+  be heap-allocated instead (see aligned_allocator.h).
+
+  Example: `HWY_ALIGN float lanes[4];`
+
+* `HWY_ALIGN_MAX`: as `HWY_ALIGN`, but independent of `HWY_TARGET` and may be
+  used outside `HWY_NAMESPACE`.
+
+## Advanced macros
+
+* `HWY_IDE` is 0 except when parsed by IDEs; adding it to conditions such as
+  `#if HWY_TARGET != HWY_SCALAR || HWY_IDE` avoids code appearing greyed out.
+
+The following indicate support for certain lane types and expand to 1 or 0:
+
+* `HWY_HAVE_INTEGER64`: support for 64-bit signed/unsigned integer lanes.
+* `HWY_HAVE_FLOAT16`: support for 16-bit floating-point lanes.
+* `HWY_HAVE_FLOAT64`: support for double-precision floating-point lanes.
+
+The above were previously known as `HWY_CAP_INTEGER64`, `HWY_CAP_FLOAT16`, and
+`HWY_CAP_FLOAT64`, respectively. Those `HWY_CAP_*` names are DEPRECATED.
+
+* `HWY_HAVE_SCALABLE` indicates vector sizes are unknown at compile time and
+  instead determined by the CPU.
+
+* `HWY_MEM_OPS_MIGHT_FAULT` is 1 iff `MaskedLoad` may trigger a (page) fault
+  when attempting to load lanes from unmapped memory, even if the
+  corresponding mask element is false. This is the case on ASAN/MSAN builds,
+  AMD x86 prior to AVX-512, and ARM NEON. If so, users can prevent faults by
+  ensuring memory addresses are aligned to the vector size or at least padded
+  (allocation size increased by at least `Lanes(d)`).
+
+* `HWY_NATIVE_FMA` expands to 1 if the `MulAdd` etc. ops use native fused
+  multiply-add. Otherwise, `MulAdd(f, m, a)` is implemented as `Add(Mul(f, m),
+  a)`. Checking this can be useful for increasing the tolerance of expected
+  results (around 1E-5 or 1E-6).
+
+The following were used to signal the maximum number of lanes for certain
+operations, but this is no longer necessary (nor possible on SVE/RVV), so they
+are DEPRECATED:
+
+* `HWY_CAP_GE256`: the current target supports vectors of >= 256 bits.
+* `HWY_CAP_GE512`: the current target supports vectors of >= 512 bits.
+
+## Detecting supported targets
+
+`SupportedTargets()` returns a non-cached (re-initialized on each call) bitfield
+of the targets supported on the current CPU, detected using CPUID on x86 or
+equivalent. This may include targets that are not in `HWY_TARGETS`, and vice
+versa. If there is no overlap the binary will likely crash. This can only happen
+if:
+
+* the specified baseline is not supported by the current CPU, which
+  contradicts the definition of baseline, so the configuration is invalid; or
+* the baseline does not include the enabled/attainable target(s), which are
+  also not supported by the current CPU, and baseline targets (in particular
+  `HWY_SCALAR`) were explicitly disabled.
+
+## Advanced configuration macros
+
+The following macros govern which targets to generate. Unless specified
+otherwise, they may be defined per translation unit, e.g. to disable >128 bit
+vectors in modules that do not benefit from them (if bandwidth-limited or only
+called occasionally). This is safe because `HWY_TARGETS` always includes at
+least one baseline target which `HWY_EXPORT` can use.
+
+* `HWY_DISABLE_CACHE_CONTROL` makes the cache-control functions no-ops.
+* `HWY_DISABLE_BMI2_FMA` prevents emitting BMI/BMI2/FMA instructions. This
+  allows using AVX2 in VMs that do not support the other instructions, but
+  only if defined for all translation units.
+
+The following `*_TARGETS` are zero or more `HWY_Target` bits and can be defined
+as an expression, e.g. `-DHWY_DISABLED_TARGETS=(HWY_SSE4|HWY_AVX3)`.
+
+* `HWY_BROKEN_TARGETS` defaults to a blocklist of known compiler bugs.
+  Defining to 0 disables the blocklist.
+
+* `HWY_DISABLED_TARGETS` defaults to zero. This allows explicitly disabling
+  targets without interfering with the blocklist; see the sketch after this
+  list.
+
+* `HWY_BASELINE_TARGETS` defaults to the set whose predefined macros are
+  defined (i.e. those for which the corresponding flag, e.g. `-mavx2`, was
+  passed to the compiler). If specified, this should be the same for all
+  translation units, otherwise the safety check in `SupportedTargets` (that
+  all enabled baseline targets are supported) may be inaccurate.
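+
+As a minimal sketch (the disabled targets here are only an example), a
+translation unit that does not benefit from 512-bit vectors could define,
+before including any Highway header:
+
+```
+#define HWY_DISABLED_TARGETS (HWY_AVX3 | HWY_AVX3_DL)
+#include "hwy/highway.h"
+```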
+
+Zero or one of the following macros may be defined to replace the default
+policy for selecting `HWY_TARGETS`:
+
+* `HWY_COMPILE_ONLY_EMU128` selects only `HWY_EMU128`, which avoids intrinsics
+  but implements all ops using standard C++.
+* `HWY_COMPILE_ONLY_SCALAR` selects only `HWY_SCALAR`, which implements
+  single-lane-only ops using standard C++.
+* `HWY_COMPILE_ONLY_STATIC` selects only `HWY_STATIC_TARGET`, which
+  effectively disables dynamic dispatch.
+* `HWY_COMPILE_ALL_ATTAINABLE` selects all attainable targets (i.e. enabled
+  and permitted by the compiler, independently of autovectorization), which
+  maximizes coverage in tests.
+
+At most one `HWY_COMPILE_ONLY_*` may be defined. `HWY_COMPILE_ALL_ATTAINABLE`
+may also be defined even if one of `HWY_COMPILE_ONLY_*` is, but will then be
+ignored.
+
+If none are defined, but `HWY_IS_TEST` is defined, the default is
+`HWY_COMPILE_ALL_ATTAINABLE`. Otherwise, the default is to select all attainable
+targets except any non-best baseline (typically `HWY_SCALAR`), which reduces
+code size.
+
+## Compiler support
+
+Clang and GCC require e.g. `-mavx2` flags in order to use SIMD intrinsics.
+However, this enables AVX2 instructions in the entire translation unit, which
+may violate the one-definition rule and cause crashes. Instead, we use
+target-specific attributes introduced via `#pragma`. Functions using SIMD must
+reside between `HWY_BEFORE_NAMESPACE` and `HWY_AFTER_NAMESPACE`. Alternatively,
+individual functions or lambdas may be prefixed with `HWY_ATTR`.
+
+If you know the SVE vector width and are using static dispatch, you can specify
+`-march=armv9-a+sve2-aes -msve-vector-bits=128` and Highway will then use
+`HWY_SVE2_128` as the baseline. Similarly, `-march=armv8.2-a+sve
+-msve-vector-bits=256` enables the `HWY_SVE_256` specialization for Neoverse V1.
+Note that these flags are unnecessary when using dynamic dispatch. Highway will
+automatically detect and dispatch to the best available target, including
+`HWY_SVE2_128` or `HWY_SVE_256`.
+
+Immediates (compile-time constants) are specified as template arguments to avoid
+constant-propagation issues with Clang on ARM.
+
+## Type traits
+
+* `IsFloat<T>()` returns true if `T` is a floating-point type.
+* `IsSigned<T>()` returns true if `T` is a signed or floating-point type.
+* `LimitsMin/Max<T>()` return the smallest/largest value representable in
+  integer `T`.
+* `SizeTag<N>` is an empty struct, used to select overloaded functions
+  appropriate for `N` bytes.
+
+* `MakeUnsigned<T>` is an alias for an unsigned type of the same size as `T`.
+
+* `MakeSigned<T>` is an alias for a signed type of the same size as `T`.
+
+* `MakeFloat<T>` is an alias for a floating-point type of the same size as
+  `T`.
+
+* `MakeWide<T>` is an alias for a type with twice the size of `T` and the same
+  category (unsigned/signed/float).
+
+* `MakeNarrow<T>` is an alias for a type with half the size of `T` and the
+  same category (unsigned/signed/float).
+
+## Memory allocation
+
+`AllocateAligned<T>(items)` returns a unique pointer to newly allocated memory
+for `items` elements of POD type `T`. The start address is aligned as required
+by `Load/Store`. Furthermore, successive allocations are not congruent modulo a
+platform-specific alignment. This helps prevent false dependencies or cache
+conflicts. The memory allocation is analogous to using `malloc()` and `free()`
+with a `std::unique_ptr` since the returned items are *not* initialized or
+default constructed and it is released using `FreeAlignedBytes()` without
+calling `~T()`.
+
+`MakeUniqueAligned<T>(Args&&... args)` creates a single object in newly
+allocated aligned memory as above, constructed by passing the `args` arguments
+to `T`'s constructor, and returns a unique pointer to it. This is analogous to
+using `std::make_unique` with `new` but for aligned memory since the object is
+constructed and later destructed when the unique pointer is deleted. Typically
+this type `T` is a struct containing multiple members with `HWY_ALIGN` or
+`HWY_ALIGN_MAX`, or arrays whose lengths are known to be a multiple of the
+vector size.
+
+`MakeUniqueAlignedArray<T>(size_t items, Args&&... args)` creates an array of
+objects in newly allocated aligned memory as above and constructs every element
+of the new array using the passed constructor parameters, returning a unique
+pointer to the array. Note that only the first element is guaranteed to be
+aligned to the vector size; because there is no padding between elements,
+the alignment of the remaining elements depends on the size of `T`.
diff --git a/g3doc/release_testing_process.md b/g3doc/release_testing_process.md
new file mode 100644
index 0000000..95836f9
--- /dev/null
+++ b/g3doc/release_testing_process.md
@@ -0,0 +1,37 @@
+## Release testing process
+
+We run the following before a release:
+
+### Windows x86
+
+```
+run_tests.bat
+```
+
+### Linux x86
+
+#### Clang, GCC, ARM cross compile
+
+```
+./run_tests.sh
+```
+
+#### JPEG XL clang (debug, asan, msan)
+
+```
+for VER in 9 10 11 12 13; do
+  rm -rf build_debug$VER && CC=clang-$VER CXX=clang++-$VER BUILD_DIR=build_debug$VER SKIP_TEST=1 ./ci.sh debug && ./ci.sh test -R PassesTest && rm -rf build_debug$VER
+  rm -rf build_asan$VER && CC=clang-$VER CXX=clang++-$VER BUILD_DIR=build_asan$VER ./ci.sh asan && rm -rf build_asan$VER
+  rm -rf build_msan$VER && CC=clang-$VER CXX=clang++-$VER BUILD_DIR=build_msan$VER ./ci.sh msan && rm -rf build_msan$VER
+done
+```
+
+#### JPEG XL tests
+
+```
+git -C third_party/highway pull -r origin master
+git diff
+vi deps.sh
+git commit -a -m"Highway test"
+git push git@github.com:$USER/libjxl.git HEAD:main --force
+```
diff --git a/hwy.gni b/hwy.gni
new file mode 100644
index 0000000..b1c954e
--- /dev/null
+++ b/hwy.gni
@@ -0,0 +1,53 @@
+_hwy = get_path_info("hwy", "abspath")
+
+hwy_public = [
+  # Public
+  "$_hwy/aligned_allocator.h",
+  "$_hwy/base.h",
+  "$_hwy/cache_control.h",
+  "$_hwy/per_target.h",
+  "$_hwy/print.h",
+
+  # Public, textual
+  "$_hwy/foreach_target.h",
+  "$_hwy/highway_export.h",
+  "$_hwy/highway.h",
+  "$_hwy/print-inl.h",
+
+  # Private
+  "$_hwy/detect_compiler_arch.h",
+  "$_hwy/detect_targets.h",
+  "$_hwy/targets.h",
+
+  # Private, textual:
+  "$_hwy/ops/arm_neon-inl.h",
+  "$_hwy/ops/arm_sve-inl.h",
+  "$_hwy/ops/emu128-inl.h",
+  "$_hwy/ops/generic_ops-inl.h",
+  "$_hwy/ops/scalar-inl.h",
+  "$_hwy/ops/set_macros-inl.h",
+  "$_hwy/ops/shared-inl.h",
+  "$_hwy/ops/x86_128-inl.h",
+  "$_hwy/ops/x86_256-inl.h",
+  "$_hwy/ops/x86_512-inl.h",
+]
+
+hwy_sources = [
+  "$_hwy/aligned_allocator.cc",
+  "$_hwy/per_target.cc",
+  "$_hwy/print.cc",
+  "$_hwy/targets.cc",
+]
+
+hwy_contrib_public = [
+  "$_hwy/contrib/algo/copy-inl.h",
+  "$_hwy/contrib/algo/find-inl.h",
+  "$_hwy/contrib/algo/transform-inl.h",
+  "$_hwy/contrib/dot/dot-inl.h",
+  "$_hwy/contrib/image/image.h",
+  "$_hwy/contrib/math/math-inl.h",
+]
+
+hwy_contrib_sources = [ + "$_hwy/contrib/image/image.cc", +] diff --git a/hwy/aligned_allocator.cc b/hwy/aligned_allocator.cc new file mode 100644 index 0000000..7b99479 --- /dev/null +++ b/hwy/aligned_allocator.cc @@ -0,0 +1,152 @@ +// Copyright 2019 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "hwy/aligned_allocator.h" + +#include +#include +#include +#include // malloc + +#include +#include + +#include "hwy/base.h" + +namespace hwy { +namespace { + +#if HWY_ARCH_RVV && defined(__riscv_vector) +// Not actually an upper bound on the size, but this value prevents crossing a +// 4K boundary (relevant on Andes). +constexpr size_t kAlignment = HWY_MAX(HWY_ALIGNMENT, 4096); +#else +constexpr size_t kAlignment = HWY_ALIGNMENT; +#endif + +#if HWY_ARCH_X86 +// On x86, aliasing can only occur at multiples of 2K, but that's too wasteful +// if this is used for single-vector allocations. 256 is more reasonable. +constexpr size_t kAlias = kAlignment * 4; +#else +constexpr size_t kAlias = kAlignment; +#endif + +#pragma pack(push, 1) +struct AllocationHeader { + void* allocated; + size_t payload_size; +}; +#pragma pack(pop) + +// Returns a 'random' (cyclical) offset for AllocateAlignedBytes. +size_t NextAlignedOffset() { + static std::atomic next{0}; + constexpr uint32_t kGroups = kAlias / kAlignment; + const uint32_t group = next.fetch_add(1, std::memory_order_relaxed) % kGroups; + const size_t offset = kAlignment * group; + HWY_DASSERT((offset % kAlignment == 0) && offset <= kAlias); + return offset; +} + +} // namespace + +HWY_DLLEXPORT void* AllocateAlignedBytes(const size_t payload_size, + AllocPtr alloc_ptr, void* opaque_ptr) { + HWY_ASSERT(payload_size != 0); // likely a bug in caller + if (payload_size >= std::numeric_limits::max() / 2) { + HWY_DASSERT(false && "payload_size too large"); + return nullptr; + } + + size_t offset = NextAlignedOffset(); + + // What: | misalign | unused | AllocationHeader |payload + // Size: |<= kAlias | offset |payload_size + // ^allocated.^aligned.^header............^payload + // The header must immediately precede payload, which must remain aligned. + // To avoid wasting space, the header resides at the end of `unused`, + // which therefore cannot be empty (offset == 0). + if (offset == 0) { + offset = kAlignment; // = RoundUpTo(sizeof(AllocationHeader), kAlignment) + static_assert(sizeof(AllocationHeader) <= kAlignment, "Else: round up"); + } + + const size_t allocated_size = kAlias + offset + payload_size; + void* allocated; + if (alloc_ptr == nullptr) { + allocated = malloc(allocated_size); + } else { + allocated = (*alloc_ptr)(opaque_ptr, allocated_size); + } + if (allocated == nullptr) return nullptr; + // Always round up even if already aligned - we already asked for kAlias + // extra bytes and there's no way to give them back. 
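+  // Worked example (values chosen for illustration only): with kAlias = 256
+  // and allocated = 0x1013, adding kAlias gives 0x1113; the masking below
+  // (~(kAlias - 1)) then yields 0x1100, which is aligned and >= allocated.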
+ uintptr_t aligned = reinterpret_cast(allocated) + kAlias; + static_assert((kAlias & (kAlias - 1)) == 0, "kAlias must be a power of 2"); + static_assert(kAlias >= kAlignment, "Cannot align to more than kAlias"); + aligned &= ~(kAlias - 1); + + const uintptr_t payload = aligned + offset; // still aligned + + // Stash `allocated` and payload_size inside header for FreeAlignedBytes(). + // The allocated_size can be reconstructed from the payload_size. + AllocationHeader* header = reinterpret_cast(payload) - 1; + header->allocated = allocated; + header->payload_size = payload_size; + + return HWY_ASSUME_ALIGNED(reinterpret_cast(payload), kAlignment); +} + +HWY_DLLEXPORT void FreeAlignedBytes(const void* aligned_pointer, + FreePtr free_ptr, void* opaque_ptr) { + if (aligned_pointer == nullptr) return; + + const uintptr_t payload = reinterpret_cast(aligned_pointer); + HWY_DASSERT(payload % kAlignment == 0); + const AllocationHeader* header = + reinterpret_cast(payload) - 1; + + if (free_ptr == nullptr) { + free(header->allocated); + } else { + (*free_ptr)(opaque_ptr, header->allocated); + } +} + +// static +HWY_DLLEXPORT void AlignedDeleter::DeleteAlignedArray(void* aligned_pointer, + FreePtr free_ptr, + void* opaque_ptr, + ArrayDeleter deleter) { + if (aligned_pointer == nullptr) return; + + const uintptr_t payload = reinterpret_cast(aligned_pointer); + HWY_DASSERT(payload % kAlignment == 0); + const AllocationHeader* header = + reinterpret_cast(payload) - 1; + + if (deleter) { + (*deleter)(aligned_pointer, header->payload_size); + } + + if (free_ptr == nullptr) { + free(header->allocated); + } else { + (*free_ptr)(opaque_ptr, header->allocated); + } +} + +} // namespace hwy diff --git a/hwy/aligned_allocator.h b/hwy/aligned_allocator.h new file mode 100644 index 0000000..f6bfca1 --- /dev/null +++ b/hwy/aligned_allocator.h @@ -0,0 +1,212 @@ +// Copyright 2020 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef HIGHWAY_HWY_ALIGNED_ALLOCATOR_H_ +#define HIGHWAY_HWY_ALIGNED_ALLOCATOR_H_ + +// Memory allocator with support for alignment and offsets. + +#include + +#include + +#include "hwy/highway_export.h" + +namespace hwy { + +// Minimum alignment of allocated memory for use in HWY_ASSUME_ALIGNED, which +// requires a literal. This matches typical L1 cache line sizes, which prevents +// false sharing. +#define HWY_ALIGNMENT 64 + +// Pointers to functions equivalent to malloc/free with an opaque void* passed +// to them. +using AllocPtr = void* (*)(void* opaque, size_t bytes); +using FreePtr = void (*)(void* opaque, void* memory); + +// Returns null or a pointer to at least `payload_size` (which can be zero) +// bytes of newly allocated memory, aligned to the larger of HWY_ALIGNMENT and +// the vector size. Calls `alloc` with the passed `opaque` pointer to obtain +// memory or malloc() if it is null. +HWY_DLLEXPORT void* AllocateAlignedBytes(size_t payload_size, + AllocPtr alloc_ptr, void* opaque_ptr); + +// Frees all memory. 
No effect if `aligned_pointer` == nullptr, otherwise it +// must have been returned from a previous call to `AllocateAlignedBytes`. +// Calls `free_ptr` with the passed `opaque_ptr` pointer to free the memory; if +// `free_ptr` function is null, uses the default free(). +HWY_DLLEXPORT void FreeAlignedBytes(const void* aligned_pointer, + FreePtr free_ptr, void* opaque_ptr); + +// Class that deletes the aligned pointer passed to operator() calling the +// destructor before freeing the pointer. This is equivalent to the +// std::default_delete but for aligned objects. For a similar deleter equivalent +// to free() for aligned memory see AlignedFreer(). +class AlignedDeleter { + public: + AlignedDeleter() : free_(nullptr), opaque_ptr_(nullptr) {} + AlignedDeleter(FreePtr free_ptr, void* opaque_ptr) + : free_(free_ptr), opaque_ptr_(opaque_ptr) {} + + template + void operator()(T* aligned_pointer) const { + return DeleteAlignedArray(aligned_pointer, free_, opaque_ptr_, + TypedArrayDeleter); + } + + private: + template + static void TypedArrayDeleter(void* ptr, size_t size_in_bytes) { + size_t elems = size_in_bytes / sizeof(T); + for (size_t i = 0; i < elems; i++) { + // Explicitly call the destructor on each element. + (static_cast(ptr) + i)->~T(); + } + } + + // Function prototype that calls the destructor for each element in a typed + // array. TypeArrayDeleter would match this prototype. + using ArrayDeleter = void (*)(void* t_ptr, size_t t_size); + + HWY_DLLEXPORT static void DeleteAlignedArray(void* aligned_pointer, + FreePtr free_ptr, + void* opaque_ptr, + ArrayDeleter deleter); + + FreePtr free_; + void* opaque_ptr_; +}; + +// Unique pointer to T with custom aligned deleter. This can be a single +// element U or an array of element if T is a U[]. The custom aligned deleter +// will call the destructor on U or each element of a U[] in the array case. +template +using AlignedUniquePtr = std::unique_ptr; + +// Aligned memory equivalent of make_unique using the custom allocators +// alloc/free with the passed `opaque` pointer. This function calls the +// constructor with the passed Args... and calls the destructor of the object +// when the AlignedUniquePtr is destroyed. +template +AlignedUniquePtr MakeUniqueAlignedWithAlloc(AllocPtr alloc, FreePtr free, + void* opaque, Args&&... args) { + T* ptr = static_cast(AllocateAlignedBytes(sizeof(T), alloc, opaque)); + return AlignedUniquePtr(new (ptr) T(std::forward(args)...), + AlignedDeleter(free, opaque)); +} + +// Similar to MakeUniqueAlignedWithAlloc but using the default alloc/free +// functions. +template +AlignedUniquePtr MakeUniqueAligned(Args&&... args) { + T* ptr = static_cast(AllocateAlignedBytes( + sizeof(T), /*alloc_ptr=*/nullptr, /*opaque_ptr=*/nullptr)); + return AlignedUniquePtr(new (ptr) T(std::forward(args)...), + AlignedDeleter()); +} + +// Helpers for array allocators (avoids overflow) +namespace detail { + +// Returns x such that 1u << x == n (if n is a power of two). +static inline constexpr size_t ShiftCount(size_t n) { + return (n <= 1) ? 0 : 1 + ShiftCount(n / 2); +} + +template +T* AllocateAlignedItems(size_t items, AllocPtr alloc_ptr, void* opaque_ptr) { + constexpr size_t size = sizeof(T); + + constexpr bool is_pow2 = (size & (size - 1)) == 0; + constexpr size_t bits = ShiftCount(size); + static_assert(!is_pow2 || (1ull << bits) == size, "ShiftCount is incorrect"); + + const size_t bytes = is_pow2 ? items << bits : items * size; + const size_t check = is_pow2 ? 
bytes >> bits : bytes / size; + if (check != items) { + return nullptr; // overflowed + } + return static_cast(AllocateAlignedBytes(bytes, alloc_ptr, opaque_ptr)); +} + +} // namespace detail + +// Aligned memory equivalent of make_unique for array types using the +// custom allocators alloc/free. This function calls the constructor with the +// passed Args... on every created item. The destructor of each element will be +// called when the AlignedUniquePtr is destroyed. +template +AlignedUniquePtr MakeUniqueAlignedArrayWithAlloc( + size_t items, AllocPtr alloc, FreePtr free, void* opaque, Args&&... args) { + T* ptr = detail::AllocateAlignedItems(items, alloc, opaque); + if (ptr != nullptr) { + for (size_t i = 0; i < items; i++) { + new (ptr + i) T(std::forward(args)...); + } + } + return AlignedUniquePtr(ptr, AlignedDeleter(free, opaque)); +} + +template +AlignedUniquePtr MakeUniqueAlignedArray(size_t items, Args&&... args) { + return MakeUniqueAlignedArrayWithAlloc( + items, nullptr, nullptr, nullptr, std::forward(args)...); +} + +// Custom deleter for std::unique_ptr equivalent to using free() as a deleter +// but for aligned memory. +class AlignedFreer { + public: + // Pass address of this to ctor to skip deleting externally-owned memory. + static void DoNothing(void* /*opaque*/, void* /*aligned_pointer*/) {} + + AlignedFreer() : free_(nullptr), opaque_ptr_(nullptr) {} + AlignedFreer(FreePtr free_ptr, void* opaque_ptr) + : free_(free_ptr), opaque_ptr_(opaque_ptr) {} + + template + void operator()(T* aligned_pointer) const { + // TODO(deymo): assert that we are using a POD type T. + FreeAlignedBytes(aligned_pointer, free_, opaque_ptr_); + } + + private: + FreePtr free_; + void* opaque_ptr_; +}; + +// Unique pointer to single POD, or (if T is U[]) an array of POD. For non POD +// data use AlignedUniquePtr. +template +using AlignedFreeUniquePtr = std::unique_ptr; + +// Allocate an aligned and uninitialized array of POD values as a unique_ptr. +// Upon destruction of the unique_ptr the aligned array will be freed. +template +AlignedFreeUniquePtr AllocateAligned(const size_t items, AllocPtr alloc, + FreePtr free, void* opaque) { + return AlignedFreeUniquePtr( + detail::AllocateAlignedItems(items, alloc, opaque), + AlignedFreer(free, opaque)); +} + +// Same as previous AllocateAligned(), using default allocate/free functions. +template +AlignedFreeUniquePtr AllocateAligned(const size_t items) { + return AllocateAligned(items, nullptr, nullptr, nullptr); +} + +} // namespace hwy +#endif // HIGHWAY_HWY_ALIGNED_ALLOCATOR_H_ diff --git a/hwy/aligned_allocator_test.cc b/hwy/aligned_allocator_test.cc new file mode 100644 index 0000000..ced08e7 --- /dev/null +++ b/hwy/aligned_allocator_test.cc @@ -0,0 +1,278 @@ +// Copyright 2020 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
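+
+// Usage sketch for the API under test (illustrative only; `MyType` stands for
+// any type with a matching constructor):
+//   auto floats = hwy::AllocateAligned<float>(1000);    // uninitialized POD
+//   auto object = hwy::MakeUniqueAligned<MyType>(arg);  // constructed, aligned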
+ +#include "hwy/aligned_allocator.h" + +#include + +#include +#include +#include +#include + +#include "gtest/gtest.h" + +namespace { + +// Sample object that keeps track on an external counter of how many times was +// the explicit constructor and destructor called. +template +class SampleObject { + public: + SampleObject() { data_[0] = 'a'; } + explicit SampleObject(int* counter) : counter_(counter) { + if (counter) (*counter)++; + data_[0] = 'b'; + } + + ~SampleObject() { + if (counter_) (*counter_)--; + } + + static_assert(N > sizeof(int*), "SampleObject size too small."); + int* counter_ = nullptr; + char data_[N - sizeof(int*)]; +}; + +class FakeAllocator { + public: + // static AllocPtr and FreePtr member to be used with the alligned + // allocator. These functions calls the private non-static members. + static void* StaticAlloc(void* opaque, size_t bytes) { + return reinterpret_cast(opaque)->Alloc(bytes); + } + static void StaticFree(void* opaque, void* memory) { + return reinterpret_cast(opaque)->Free(memory); + } + + // Returns the number of pending allocations to be freed. + size_t PendingAllocs() { return allocs_.size(); } + + private: + void* Alloc(size_t bytes) { + void* ret = malloc(bytes); + allocs_.insert(ret); + return ret; + } + void Free(void* memory) { + if (!memory) return; + EXPECT_NE(allocs_.end(), allocs_.find(memory)); + allocs_.erase(memory); + free(memory); + } + + std::set allocs_; +}; + +} // namespace + +namespace hwy { + +class AlignedAllocatorTest : public testing::Test {}; + +TEST(AlignedAllocatorTest, FreeNullptr) { + // Calling free with a nullptr is always ok. + FreeAlignedBytes(/*aligned_pointer=*/nullptr, /*free_ptr=*/nullptr, + /*opaque_ptr=*/nullptr); +} + +TEST(AlignedAllocatorTest, Log2) { + EXPECT_EQ(0u, detail::ShiftCount(1)); + EXPECT_EQ(1u, detail::ShiftCount(2)); + EXPECT_EQ(3u, detail::ShiftCount(8)); +} + +// Allocator returns null when it detects overflow of items * sizeof(T). +TEST(AlignedAllocatorTest, Overflow) { + constexpr size_t max = ~size_t(0); + constexpr size_t msb = (max >> 1) + 1; + using Size5 = std::array; + using Size10 = std::array; + EXPECT_EQ(nullptr, + detail::AllocateAlignedItems(max / 2, nullptr, nullptr)); + EXPECT_EQ(nullptr, + detail::AllocateAlignedItems(max / 3, nullptr, nullptr)); + EXPECT_EQ(nullptr, + detail::AllocateAlignedItems(max / 4, nullptr, nullptr)); + EXPECT_EQ(nullptr, + detail::AllocateAlignedItems(msb, nullptr, nullptr)); + EXPECT_EQ(nullptr, + detail::AllocateAlignedItems(msb + 1, nullptr, nullptr)); + EXPECT_EQ(nullptr, + detail::AllocateAlignedItems(msb / 4, nullptr, nullptr)); +} + +TEST(AlignedAllocatorTest, AllocDefaultPointers) { + const size_t kSize = 7777; + void* ptr = AllocateAlignedBytes(kSize, /*alloc_ptr=*/nullptr, + /*opaque_ptr=*/nullptr); + ASSERT_NE(nullptr, ptr); + // Make sure the pointer is actually aligned. + EXPECT_EQ(0U, reinterpret_cast(ptr) % HWY_ALIGNMENT); + char* p = static_cast(ptr); + size_t ret = 0; + for (size_t i = 0; i < kSize; i++) { + // Performs a computation using p[] to prevent it being optimized away. 
+ p[i] = static_cast(i & 0x7F); + if (i) ret += static_cast(p[i] * p[i - 1]); + } + EXPECT_NE(0U, ret); + FreeAlignedBytes(ptr, /*free_ptr=*/nullptr, /*opaque_ptr=*/nullptr); +} + +TEST(AlignedAllocatorTest, EmptyAlignedUniquePtr) { + AlignedUniquePtr> ptr(nullptr, AlignedDeleter()); + AlignedUniquePtr[]> arr(nullptr, AlignedDeleter()); +} + +TEST(AlignedAllocatorTest, EmptyAlignedFreeUniquePtr) { + AlignedFreeUniquePtr> ptr(nullptr, AlignedFreer()); + AlignedFreeUniquePtr[]> arr(nullptr, AlignedFreer()); +} + +TEST(AlignedAllocatorTest, CustomAlloc) { + FakeAllocator fake_alloc; + + const size_t kSize = 7777; + void* ptr = + AllocateAlignedBytes(kSize, &FakeAllocator::StaticAlloc, &fake_alloc); + ASSERT_NE(nullptr, ptr); + // We should have only requested one alloc from the allocator. + EXPECT_EQ(1U, fake_alloc.PendingAllocs()); + // Make sure the pointer is actually aligned. + EXPECT_EQ(0U, reinterpret_cast(ptr) % HWY_ALIGNMENT); + FreeAlignedBytes(ptr, &FakeAllocator::StaticFree, &fake_alloc); + EXPECT_EQ(0U, fake_alloc.PendingAllocs()); +} + +TEST(AlignedAllocatorTest, MakeUniqueAlignedDefaultConstructor) { + { + auto ptr = MakeUniqueAligned>(); + // Default constructor sets the data_[0] to 'a'. + EXPECT_EQ('a', ptr->data_[0]); + EXPECT_EQ(nullptr, ptr->counter_); + } +} + +TEST(AlignedAllocatorTest, MakeUniqueAligned) { + int counter = 0; + { + // Creates the object, initializes it with the explicit constructor and + // returns an unique_ptr to it. + auto ptr = MakeUniqueAligned>(&counter); + EXPECT_EQ(1, counter); + // Custom constructor sets the data_[0] to 'b'. + EXPECT_EQ('b', ptr->data_[0]); + } + EXPECT_EQ(0, counter); +} + +TEST(AlignedAllocatorTest, MakeUniqueAlignedArray) { + int counter = 0; + { + // Creates the array of objects and initializes them with the explicit + // constructor. + auto arr = MakeUniqueAlignedArray>(7, &counter); + EXPECT_EQ(7, counter); + for (size_t i = 0; i < 7; i++) { + // Custom constructor sets the data_[0] to 'b'. + EXPECT_EQ('b', arr[i].data_[0]) << "Where i = " << i; + } + } + EXPECT_EQ(0, counter); +} + +TEST(AlignedAllocatorTest, AllocSingleInt) { + auto ptr = AllocateAligned(1); + ASSERT_NE(nullptr, ptr.get()); + EXPECT_EQ(0U, reinterpret_cast(ptr.get()) % HWY_ALIGNMENT); + // Force delete of the unique_ptr now to check that it doesn't crash. + ptr.reset(nullptr); + EXPECT_EQ(nullptr, ptr.get()); +} + +TEST(AlignedAllocatorTest, AllocMultipleInt) { + const size_t kSize = 7777; + auto ptr = AllocateAligned(kSize); + ASSERT_NE(nullptr, ptr.get()); + EXPECT_EQ(0U, reinterpret_cast(ptr.get()) % HWY_ALIGNMENT); + // ptr[i] is actually (*ptr.get())[i] which will use the operator[] of the + // underlying type chosen by AllocateAligned() for the std::unique_ptr. + EXPECT_EQ(&(ptr[0]) + 1, &(ptr[1])); + + size_t ret = 0; + for (size_t i = 0; i < kSize; i++) { + // Performs a computation using ptr[] to prevent it being optimized away. + ptr[i] = static_cast(i); + if (i) ret += ptr[i] * ptr[i - 1]; + } + EXPECT_NE(0U, ret); +} + +TEST(AlignedAllocatorTest, AllocateAlignedObjectWithoutDestructor) { + int counter = 0; + { + // This doesn't call the constructor. + auto obj = AllocateAligned>(1); + obj[0].counter_ = &counter; + } + // Destroying the unique_ptr shouldn't have called the destructor of the + // SampleObject<24>. 
+ EXPECT_EQ(0, counter); +} + +TEST(AlignedAllocatorTest, MakeUniqueAlignedArrayWithCustomAlloc) { + FakeAllocator fake_alloc; + int counter = 0; + { + // Creates the array of objects and initializes them with the explicit + // constructor. + auto arr = MakeUniqueAlignedArrayWithAlloc>( + 7, FakeAllocator::StaticAlloc, FakeAllocator::StaticFree, &fake_alloc, + &counter); + ASSERT_NE(nullptr, arr.get()); + // An array should still only call a single allocation. + EXPECT_EQ(1u, fake_alloc.PendingAllocs()); + EXPECT_EQ(7, counter); + for (size_t i = 0; i < 7; i++) { + // Custom constructor sets the data_[0] to 'b'. + EXPECT_EQ('b', arr[i].data_[0]) << "Where i = " << i; + } + } + EXPECT_EQ(0, counter); + EXPECT_EQ(0u, fake_alloc.PendingAllocs()); +} + +TEST(AlignedAllocatorTest, DefaultInit) { + // The test is whether this compiles. Default-init is useful for output params + // and per-thread storage. + std::vector> ptrs; + std::vector> free_ptrs; + ptrs.resize(128); + free_ptrs.resize(128); + // The following is to prevent elision of the pointers. + std::mt19937 rng(129); // Emscripten lacks random_device. + std::uniform_int_distribution dist(0, 127); + ptrs[dist(rng)] = MakeUniqueAlignedArray(123); + free_ptrs[dist(rng)] = AllocateAligned(456); + // "Use" pointer without resorting to printf. 0 == 0. Can't shift by 64. + const auto addr1 = reinterpret_cast(ptrs[dist(rng)].get()); + const auto addr2 = reinterpret_cast(free_ptrs[dist(rng)].get()); + constexpr size_t kBits = sizeof(uintptr_t) * 8; + EXPECT_EQ((addr1 >> (kBits - 1)) >> (kBits - 1), + (addr2 >> (kBits - 1)) >> (kBits - 1)); +} + +} // namespace hwy diff --git a/hwy/base.h b/hwy/base.h new file mode 100644 index 0000000..0a4491e --- /dev/null +++ b/hwy/base.h @@ -0,0 +1,946 @@ +// Copyright 2020 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef HIGHWAY_HWY_BASE_H_ +#define HIGHWAY_HWY_BASE_H_ + +// For SIMD module implementations and their callers, target-independent. 
+ +#include +#include + +#include "hwy/detect_compiler_arch.h" +#include "hwy/highway_export.h" + +#if HWY_COMPILER_MSVC +#include // memcpy +#endif +#if HWY_ARCH_X86 +#include +#endif + +//------------------------------------------------------------------------------ +// Compiler-specific definitions + +#define HWY_STR_IMPL(macro) #macro +#define HWY_STR(macro) HWY_STR_IMPL(macro) + +#if HWY_COMPILER_MSVC + +#include + +#define HWY_RESTRICT __restrict +#define HWY_INLINE __forceinline +#define HWY_NOINLINE __declspec(noinline) +#define HWY_FLATTEN +#define HWY_NORETURN __declspec(noreturn) +#define HWY_LIKELY(expr) (expr) +#define HWY_UNLIKELY(expr) (expr) +#define HWY_PRAGMA(tokens) __pragma(tokens) +#define HWY_DIAGNOSTICS(tokens) HWY_PRAGMA(warning(tokens)) +#define HWY_DIAGNOSTICS_OFF(msc, gcc) HWY_DIAGNOSTICS(msc) +#define HWY_MAYBE_UNUSED +#define HWY_HAS_ASSUME_ALIGNED 0 +#if (_MSC_VER >= 1700) +#define HWY_MUST_USE_RESULT _Check_return_ +#else +#define HWY_MUST_USE_RESULT +#endif + +#else + +#define HWY_RESTRICT __restrict__ +// force inlining without optimization enabled creates very inefficient code +// that can cause compiler timeout +#ifdef __OPTIMIZE__ +#define HWY_INLINE inline __attribute__((always_inline)) +#else +#define HWY_INLINE inline +#endif +#define HWY_NOINLINE __attribute__((noinline)) +#define HWY_FLATTEN __attribute__((flatten)) +#define HWY_NORETURN __attribute__((noreturn)) +#define HWY_LIKELY(expr) __builtin_expect(!!(expr), 1) +#define HWY_UNLIKELY(expr) __builtin_expect(!!(expr), 0) +#define HWY_PRAGMA(tokens) _Pragma(#tokens) +#define HWY_DIAGNOSTICS(tokens) HWY_PRAGMA(GCC diagnostic tokens) +#define HWY_DIAGNOSTICS_OFF(msc, gcc) HWY_DIAGNOSTICS(gcc) +// Encountered "attribute list cannot appear here" when using the C++17 +// [[maybe_unused]], so only use the old style attribute for now. +#define HWY_MAYBE_UNUSED __attribute__((unused)) +#define HWY_MUST_USE_RESULT __attribute__((warn_unused_result)) + +#endif // !HWY_COMPILER_MSVC + +//------------------------------------------------------------------------------ +// Builtin/attributes + +// Enables error-checking of format strings. +#if HWY_HAS_ATTRIBUTE(__format__) +#define HWY_FORMAT(idx_fmt, idx_arg) \ + __attribute__((__format__(__printf__, idx_fmt, idx_arg))) +#else +#define HWY_FORMAT(idx_fmt, idx_arg) +#endif + +// Returns a void* pointer which the compiler then assumes is N-byte aligned. +// Example: float* HWY_RESTRICT aligned = (float*)HWY_ASSUME_ALIGNED(in, 32); +// +// The assignment semantics are required by GCC/Clang. ICC provides an in-place +// __assume_aligned, whereas MSVC's __assume appears unsuitable. +#if HWY_HAS_BUILTIN(__builtin_assume_aligned) +#define HWY_ASSUME_ALIGNED(ptr, align) __builtin_assume_aligned((ptr), (align)) +#else +#define HWY_ASSUME_ALIGNED(ptr, align) (ptr) /* not supported */ +#endif + +// Clang and GCC require attributes on each function into which SIMD intrinsics +// are inlined. Support both per-function annotation (HWY_ATTR) for lambdas and +// automatic annotation via pragmas. 
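+// Illustrative sketch (the "avx2" target string is only an example):
+//   HWY_PUSH_ATTRIBUTES("avx2")  // subsequent functions get the attribute
+//   ...functions using SIMD...
+//   HWY_POP_ATTRIBUTES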
+#if HWY_COMPILER_CLANG +#define HWY_PUSH_ATTRIBUTES(targets_str) \ + HWY_PRAGMA(clang attribute push(__attribute__((target(targets_str))), \ + apply_to = function)) +#define HWY_POP_ATTRIBUTES HWY_PRAGMA(clang attribute pop) +#elif HWY_COMPILER_GCC +#define HWY_PUSH_ATTRIBUTES(targets_str) \ + HWY_PRAGMA(GCC push_options) HWY_PRAGMA(GCC target targets_str) +#define HWY_POP_ATTRIBUTES HWY_PRAGMA(GCC pop_options) +#else +#define HWY_PUSH_ATTRIBUTES(targets_str) +#define HWY_POP_ATTRIBUTES +#endif + +//------------------------------------------------------------------------------ +// Macros + +#define HWY_API static HWY_INLINE HWY_FLATTEN HWY_MAYBE_UNUSED + +#define HWY_CONCAT_IMPL(a, b) a##b +#define HWY_CONCAT(a, b) HWY_CONCAT_IMPL(a, b) + +#define HWY_MIN(a, b) ((a) < (b) ? (a) : (b)) +#define HWY_MAX(a, b) ((a) > (b) ? (a) : (b)) + +#if HWY_COMPILER_GCC_ACTUAL +// nielskm: GCC does not support '#pragma GCC unroll' without the factor. +#define HWY_UNROLL(factor) HWY_PRAGMA(GCC unroll factor) +#define HWY_DEFAULT_UNROLL HWY_UNROLL(4) +#elif HWY_COMPILER_CLANG || HWY_COMPILER_ICC || HWY_COMPILER_ICX +#define HWY_UNROLL(factor) HWY_PRAGMA(unroll factor) +#define HWY_DEFAULT_UNROLL HWY_UNROLL() +#else +#define HWY_UNROLL(factor) +#define HWY_DEFAULT_UNROLL +#endif + + +// Compile-time fence to prevent undesirable code reordering. On Clang x86, the +// typical asm volatile("" : : : "memory") has no effect, whereas atomic fence +// does, without generating code. +#if HWY_ARCH_X86 +#define HWY_FENCE std::atomic_thread_fence(std::memory_order_acq_rel) +#else +// TODO(janwas): investigate alternatives. On ARM, the above generates barriers. +#define HWY_FENCE +#endif + +// 4 instances of a given literal value, useful as input to LoadDup128. +#define HWY_REP4(literal) literal, literal, literal, literal + +#define HWY_ABORT(format, ...) \ + ::hwy::Abort(__FILE__, __LINE__, format, ##__VA_ARGS__) + +// Always enabled. +#define HWY_ASSERT(condition) \ + do { \ + if (!(condition)) { \ + HWY_ABORT("Assert %s", #condition); \ + } \ + } while (0) + +#if HWY_HAS_FEATURE(memory_sanitizer) || defined(MEMORY_SANITIZER) +#define HWY_IS_MSAN 1 +#else +#define HWY_IS_MSAN 0 +#endif + +#if HWY_HAS_FEATURE(address_sanitizer) || defined(ADDRESS_SANITIZER) +#define HWY_IS_ASAN 1 +#else +#define HWY_IS_ASAN 0 +#endif + +#if HWY_HAS_FEATURE(thread_sanitizer) || defined(THREAD_SANITIZER) +#define HWY_IS_TSAN 1 +#else +#define HWY_IS_TSAN 0 +#endif + +// MSAN may cause lengthy build times or false positives e.g. in AVX3 DemoteTo. +// You can disable MSAN by adding this attribute to the function that fails. +#if HWY_IS_MSAN +#define HWY_ATTR_NO_MSAN __attribute__((no_sanitize_memory)) +#else +#define HWY_ATTR_NO_MSAN +#endif + +// For enabling HWY_DASSERT and shortening tests in slower debug builds +#if !defined(HWY_IS_DEBUG_BUILD) +// Clang does not define NDEBUG, but it and GCC define __OPTIMIZE__, and recent +// MSVC defines NDEBUG (if not, could instead check _DEBUG). 
+#if (!defined(__OPTIMIZE__) && !defined(NDEBUG)) || HWY_IS_ASAN || \ + HWY_IS_MSAN || HWY_IS_TSAN || defined(__clang_analyzer__) +#define HWY_IS_DEBUG_BUILD 1 +#else +#define HWY_IS_DEBUG_BUILD 0 +#endif +#endif // HWY_IS_DEBUG_BUILD + +#if HWY_IS_DEBUG_BUILD +#define HWY_DASSERT(condition) HWY_ASSERT(condition) +#else +#define HWY_DASSERT(condition) \ + do { \ + } while (0) +#endif + +namespace hwy { + +//------------------------------------------------------------------------------ +// kMaxVectorSize (undocumented, pending removal) + +#if HWY_ARCH_X86 +static constexpr HWY_MAYBE_UNUSED size_t kMaxVectorSize = 64; // AVX-512 +#elif HWY_ARCH_RVV && defined(__riscv_vector) +// Not actually an upper bound on the size. +static constexpr HWY_MAYBE_UNUSED size_t kMaxVectorSize = 4096; +#else +static constexpr HWY_MAYBE_UNUSED size_t kMaxVectorSize = 16; +#endif + +//------------------------------------------------------------------------------ +// Alignment + +// Potentially useful for LoadDup128 and capped vectors. In other cases, arrays +// should be allocated dynamically via aligned_allocator.h because Lanes() may +// exceed the stack size. +#if HWY_ARCH_X86 +#define HWY_ALIGN_MAX alignas(64) +#elif HWY_ARCH_RVV && defined(__riscv_vector) +#define HWY_ALIGN_MAX alignas(8) // only elements need be aligned +#else +#define HWY_ALIGN_MAX alignas(16) +#endif + +//------------------------------------------------------------------------------ +// Lane types + +// Match [u]int##_t naming scheme so rvv-inl.h macros can obtain the type name +// by concatenating base type and bits. + +#pragma pack(push, 1) + +// ACLE (https://gcc.gnu.org/onlinedocs/gcc/Half-Precision.html): +// always supported on aarch64, for v7 only if -mfp16-format is given. +#if ((HWY_ARCH_ARM_A64 || (__ARM_FP & 2)) && HWY_COMPILER_GCC) +using float16_t = __fp16; +// C11 extension ISO/IEC TS 18661-3:2015 but not supported on all targets. +// Required for Clang RVV if the float16 extension is used. +#elif HWY_ARCH_RVV && HWY_COMPILER_CLANG && defined(__riscv_zvfh) +using float16_t = _Float16; +// Otherwise emulate +#else +struct float16_t { + uint16_t bits; +}; +#endif + +struct bfloat16_t { + uint16_t bits; +}; + +#pragma pack(pop) + +using float32_t = float; +using float64_t = double; + +#pragma pack(push, 1) + +// Aligned 128-bit type. Cannot use __int128 because clang doesn't yet align it: +// https://reviews.llvm.org/D86310 +struct alignas(16) uint128_t { + uint64_t lo; // little-endian layout + uint64_t hi; +}; + +// 64 bit key plus 64 bit value. Faster than using uint128_t when only the key +// field is to be compared (Lt128Upper instead of Lt128). +struct alignas(16) K64V64 { + uint64_t value; // little-endian layout + uint64_t key; +}; + +// 32 bit key plus 32 bit value. Allows vqsort recursions to terminate earlier +// than when considering both to be a 64-bit key. +struct alignas(8) K32V32 { + uint32_t value; // little-endian layout + uint32_t key; +}; + +#pragma pack(pop) + +static inline HWY_MAYBE_UNUSED bool operator<(const uint128_t& a, + const uint128_t& b) { + return (a.hi == b.hi) ? a.lo < b.lo : a.hi < b.hi; +} +// Required for std::greater. 
+static inline HWY_MAYBE_UNUSED bool operator>(const uint128_t& a, + const uint128_t& b) { + return b < a; +} +static inline HWY_MAYBE_UNUSED bool operator==(const uint128_t& a, + const uint128_t& b) { + return a.lo == b.lo && a.hi == b.hi; +} + +static inline HWY_MAYBE_UNUSED bool operator<(const K64V64& a, + const K64V64& b) { + return a.key < b.key; +} +// Required for std::greater. +static inline HWY_MAYBE_UNUSED bool operator>(const K64V64& a, + const K64V64& b) { + return b < a; +} + +static inline HWY_MAYBE_UNUSED bool operator<(const K32V32& a, + const K32V32& b) { + return a.key < b.key; +} +// Required for std::greater. +static inline HWY_MAYBE_UNUSED bool operator>(const K32V32& a, + const K32V32& b) { + return b < a; +} + +//------------------------------------------------------------------------------ +// Controlling overload resolution (SFINAE) + +template +struct EnableIfT {}; +template <> +struct EnableIfT { + using type = void; +}; + +template +using EnableIf = typename EnableIfT::type; + +template +struct IsSameT { + enum { value = 0 }; +}; + +template +struct IsSameT { + enum { value = 1 }; +}; + +template +HWY_API constexpr bool IsSame() { + return IsSameT::value; +} + +// Insert into template/function arguments to enable this overload only for +// vectors of AT MOST this many bits. +// +// Note that enabling for exactly 128 bits is unnecessary because a function can +// simply be overloaded with Vec128 and/or Full128 tag. Enabling for other +// sizes (e.g. 64 bit) can be achieved via Simd. +#define HWY_IF_LE128(T, N) hwy::EnableIf* = nullptr +#define HWY_IF_LE64(T, N) hwy::EnableIf* = nullptr +#define HWY_IF_LE32(T, N) hwy::EnableIf* = nullptr +#define HWY_IF_GE32(T, N) hwy::EnableIf= 4>* = nullptr +#define HWY_IF_GE64(T, N) hwy::EnableIf= 8>* = nullptr +#define HWY_IF_GE128(T, N) hwy::EnableIf= 16>* = nullptr +#define HWY_IF_GT128(T, N) hwy::EnableIf<(N * sizeof(T) > 16)>* = nullptr + +#define HWY_IF_UNSIGNED(T) hwy::EnableIf()>* = nullptr +#define HWY_IF_SIGNED(T) \ + hwy::EnableIf() && !IsFloat()>* = nullptr +#define HWY_IF_FLOAT(T) hwy::EnableIf()>* = nullptr +#define HWY_IF_NOT_FLOAT(T) hwy::EnableIf()>* = nullptr + +#define HWY_IF_LANE_SIZE(T, bytes) \ + hwy::EnableIf* = nullptr +#define HWY_IF_NOT_LANE_SIZE(T, bytes) \ + hwy::EnableIf* = nullptr +#define HWY_IF_LANE_SIZE_LT(T, bytes) \ + hwy::EnableIf* = nullptr + +#define HWY_IF_LANES_PER_BLOCK(T, N, LANES) \ + hwy::EnableIf* = nullptr + +// Empty struct used as a size tag type. 
+template +struct SizeTag {}; + +template +struct RemoveConstT { + using type = T; +}; +template +struct RemoveConstT { + using type = T; +}; + +template +using RemoveConst = typename RemoveConstT::type; + +//------------------------------------------------------------------------------ +// Type relations + +namespace detail { + +template +struct Relations; +template <> +struct Relations { + using Unsigned = uint8_t; + using Signed = int8_t; + using Wide = uint16_t; + enum { is_signed = 0, is_float = 0 }; +}; +template <> +struct Relations { + using Unsigned = uint8_t; + using Signed = int8_t; + using Wide = int16_t; + enum { is_signed = 1, is_float = 0 }; +}; +template <> +struct Relations { + using Unsigned = uint16_t; + using Signed = int16_t; + using Wide = uint32_t; + using Narrow = uint8_t; + enum { is_signed = 0, is_float = 0 }; +}; +template <> +struct Relations { + using Unsigned = uint16_t; + using Signed = int16_t; + using Wide = int32_t; + using Narrow = int8_t; + enum { is_signed = 1, is_float = 0 }; +}; +template <> +struct Relations { + using Unsigned = uint32_t; + using Signed = int32_t; + using Float = float; + using Wide = uint64_t; + using Narrow = uint16_t; + enum { is_signed = 0, is_float = 0 }; +}; +template <> +struct Relations { + using Unsigned = uint32_t; + using Signed = int32_t; + using Float = float; + using Wide = int64_t; + using Narrow = int16_t; + enum { is_signed = 1, is_float = 0 }; +}; +template <> +struct Relations { + using Unsigned = uint64_t; + using Signed = int64_t; + using Float = double; + using Wide = uint128_t; + using Narrow = uint32_t; + enum { is_signed = 0, is_float = 0 }; +}; +template <> +struct Relations { + using Unsigned = uint64_t; + using Signed = int64_t; + using Float = double; + using Narrow = int32_t; + enum { is_signed = 1, is_float = 0 }; +}; +template <> +struct Relations { + using Unsigned = uint128_t; + using Narrow = uint64_t; + enum { is_signed = 0, is_float = 0 }; +}; +template <> +struct Relations { + using Unsigned = uint16_t; + using Signed = int16_t; + using Float = float16_t; + using Wide = float; + enum { is_signed = 1, is_float = 1 }; +}; +template <> +struct Relations { + using Unsigned = uint16_t; + using Signed = int16_t; + using Wide = float; + enum { is_signed = 1, is_float = 1 }; +}; +template <> +struct Relations { + using Unsigned = uint32_t; + using Signed = int32_t; + using Float = float; + using Wide = double; + using Narrow = float16_t; + enum { is_signed = 1, is_float = 1 }; +}; +template <> +struct Relations { + using Unsigned = uint64_t; + using Signed = int64_t; + using Float = double; + using Narrow = float; + enum { is_signed = 1, is_float = 1 }; +}; + +template +struct TypeFromSize; +template <> +struct TypeFromSize<1> { + using Unsigned = uint8_t; + using Signed = int8_t; +}; +template <> +struct TypeFromSize<2> { + using Unsigned = uint16_t; + using Signed = int16_t; +}; +template <> +struct TypeFromSize<4> { + using Unsigned = uint32_t; + using Signed = int32_t; + using Float = float; +}; +template <> +struct TypeFromSize<8> { + using Unsigned = uint64_t; + using Signed = int64_t; + using Float = double; +}; +template <> +struct TypeFromSize<16> { + using Unsigned = uint128_t; +}; + +} // namespace detail + +// Aliases for types of a different category, but the same size. 
+template +using MakeUnsigned = typename detail::Relations::Unsigned; +template +using MakeSigned = typename detail::Relations::Signed; +template +using MakeFloat = typename detail::Relations::Float; + +// Aliases for types of the same category, but different size. +template +using MakeWide = typename detail::Relations::Wide; +template +using MakeNarrow = typename detail::Relations::Narrow; + +// Obtain type from its size [bytes]. +template +using UnsignedFromSize = typename detail::TypeFromSize::Unsigned; +template +using SignedFromSize = typename detail::TypeFromSize::Signed; +template +using FloatFromSize = typename detail::TypeFromSize::Float; + +// Avoid confusion with SizeTag where the parameter is a lane size. +using UnsignedTag = SizeTag<0>; +using SignedTag = SizeTag<0x100>; // integer +using FloatTag = SizeTag<0x200>; + +template > +constexpr auto TypeTag() -> hwy::SizeTag<((R::is_signed + R::is_float) << 8)> { + return hwy::SizeTag<((R::is_signed + R::is_float) << 8)>(); +} + +// For when we only want to distinguish FloatTag from everything else. +using NonFloatTag = SizeTag<0x400>; + +template > +constexpr auto IsFloatTag() -> hwy::SizeTag<(R::is_float ? 0x200 : 0x400)> { + return hwy::SizeTag<(R::is_float ? 0x200 : 0x400)>(); +} + +//------------------------------------------------------------------------------ +// Type traits + +template +HWY_API constexpr bool IsFloat() { + // Cannot use T(1.25) != T(1) for float16_t, which can only be converted to or + // from a float, not compared. + return IsSame() || IsSame(); +} + +template +HWY_API constexpr bool IsSigned() { + return T(0) > T(-1); +} +template <> +constexpr bool IsSigned() { + return true; +} +template <> +constexpr bool IsSigned() { + return true; +} + +// Largest/smallest representable integer values. +template +HWY_API constexpr T LimitsMax() { + static_assert(!IsFloat(), "Only for integer types"); + using TU = MakeUnsigned; + return static_cast(IsSigned() ? (static_cast(~0ull) >> 1) + : static_cast(~0ull)); +} +template +HWY_API constexpr T LimitsMin() { + static_assert(!IsFloat(), "Only for integer types"); + return IsSigned() ? T(-1) - LimitsMax() : T(0); +} + +// Largest/smallest representable value (integer or float). This naming avoids +// confusion with numeric_limits::min() (the smallest positive value). +template +HWY_API constexpr T LowestValue() { + return LimitsMin(); +} +template <> +constexpr float LowestValue() { + return -3.402823466e+38F; +} +template <> +constexpr double LowestValue() { + return -1.7976931348623158e+308; +} + +template +HWY_API constexpr T HighestValue() { + return LimitsMax(); +} +template <> +constexpr float HighestValue() { + return 3.402823466e+38F; +} +template <> +constexpr double HighestValue() { + return 1.7976931348623158e+308; +} + +// Difference between 1.0 and the next representable value. +template +HWY_API constexpr T Epsilon() { + return 1; +} +template <> +constexpr float Epsilon() { + return 1.192092896e-7f; +} +template <> +constexpr double Epsilon() { + return 2.2204460492503131e-16; +} + +// Returns width in bits of the mantissa field in IEEE binary32/64. +template +constexpr int MantissaBits() { + static_assert(sizeof(T) == 0, "Only instantiate the specializations"); + return 0; +} +template <> +constexpr int MantissaBits() { + return 23; +} +template <> +constexpr int MantissaBits() { + return 52; +} + +// Returns the (left-shifted by one bit) IEEE binary32/64 representation with +// the largest possible (biased) exponent field. Used by IsInf. 
+template <typename T>
+constexpr MakeSigned<T> MaxExponentTimes2() {
+  return -(MakeSigned<T>{1} << (MantissaBits<T>() + 1));
+}
+
+// Returns bitmask of the sign bit in IEEE binary32/64.
+template <typename T>
+constexpr MakeUnsigned<T> SignMask() {
+  return MakeUnsigned<T>{1} << (sizeof(T) * 8 - 1);
+}
+
+// Returns bitmask of the exponent field in IEEE binary32/64.
+template <typename T>
+constexpr MakeUnsigned<T> ExponentMask() {
+  return (~(MakeUnsigned<T>{1} << MantissaBits<T>()) + 1) & ~SignMask<T>();
+}
+
+// Returns bitmask of the mantissa field in IEEE binary32/64.
+template <typename T>
+constexpr MakeUnsigned<T> MantissaMask() {
+  return (MakeUnsigned<T>{1} << MantissaBits<T>()) - 1;
+}
+
+// Returns 1 << mantissa_bits as a floating-point number. All integers whose
+// absolute value is less than this can be represented exactly.
+template <typename T>
+constexpr T MantissaEnd() {
+  static_assert(sizeof(T) == 0, "Only instantiate the specializations");
+  return 0;
+}
+template <>
+constexpr float MantissaEnd<float>() {
+  return 8388608.0f;  // 1 << 23
+}
+template <>
+constexpr double MantissaEnd<double>() {
+  // A hexadecimal float literal (0x1p52) would require C++17.
+  return 4503599627370496.0;  // 1 << 52
+}
+
+// Returns width in bits of the exponent field in IEEE binary32/64.
+template <typename T>
+constexpr int ExponentBits() {
+  // Exponent := remaining bits after deducting sign and mantissa.
+  return 8 * sizeof(T) - 1 - MantissaBits<T>();
+}
+
+// Returns largest value of the biased exponent field in IEEE binary32/64,
+// right-shifted so that the LSB is bit zero. Example: 0xFF for float.
+// This is expressed as a signed integer for more efficient comparison.
+template <typename T>
+constexpr MakeSigned<T> MaxExponentField() {
+  return (MakeSigned<T>{1} << ExponentBits<T>()) - 1;
+}
+
+//------------------------------------------------------------------------------
+// Helper functions
+
+template <typename T1, typename T2>
+constexpr inline T1 DivCeil(T1 a, T2 b) {
+  return (a + b - 1) / b;
+}
+
+// Works for any `align`; if a power of two, compiler emits ADD+AND.
+constexpr inline size_t RoundUpTo(size_t what, size_t align) {
+  return DivCeil(what, align) * align;
+}
+
+// Undefined results for x == 0.
+HWY_API size_t Num0BitsBelowLS1Bit_Nonzero32(const uint32_t x) {
+#if HWY_COMPILER_MSVC
+  unsigned long index;  // NOLINT
+  _BitScanForward(&index, x);
+  return index;
+#else   // HWY_COMPILER_MSVC
+  return static_cast<size_t>(__builtin_ctz(x));
+#endif  // HWY_COMPILER_MSVC
+}
+
+HWY_API size_t Num0BitsBelowLS1Bit_Nonzero64(const uint64_t x) {
+#if HWY_COMPILER_MSVC
+#if HWY_ARCH_X86_64
+  unsigned long index;  // NOLINT
+  _BitScanForward64(&index, x);
+  return index;
+#else   // HWY_ARCH_X86_64
+  // _BitScanForward64 not available
+  uint32_t lsb = static_cast<uint32_t>(x & 0xFFFFFFFF);
+  unsigned long index;  // NOLINT
+  if (lsb == 0) {
+    uint32_t msb = static_cast<uint32_t>(x >> 32u);
+    _BitScanForward(&index, msb);
+    return 32 + index;
+  } else {
+    _BitScanForward(&index, lsb);
+    return index;
+  }
+#endif  // HWY_ARCH_X86_64
+#else   // HWY_COMPILER_MSVC
+  return static_cast<size_t>(__builtin_ctzll(x));
+#endif  // HWY_COMPILER_MSVC
+}
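
For concreteness, the IEEE-754 helpers above evaluate as follows for float (1 sign bit, 8 exponent bits, 23 mantissa bits); again an illustrative sketch, not part of the imported sources:

  #include "hwy/base.h"

  static_assert(hwy::SignMask<float>() == 0x80000000u, "");
  static_assert(hwy::ExponentMask<float>() == 0x7F800000u, "");
  static_assert(hwy::MantissaMask<float>() == 0x007FFFFFu, "");
  static_assert(hwy::ExponentBits<float>() == 8, "");
  static_assert(hwy::MaxExponentField<float>() == 0xFF, "");
  // The three masks partition the 32 bits exactly:
  static_assert((hwy::SignMask<float>() | hwy::ExponentMask<float>() |
                 hwy::MantissaMask<float>()) == 0xFFFFFFFFu, "");

+// Undefined results for x == 0.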
+HWY_API size_t Num0BitsAboveMS1Bit_Nonzero32(const uint32_t x) {
+#if HWY_COMPILER_MSVC
+  unsigned long index;  // NOLINT
+  _BitScanReverse(&index, x);
+  return 31 - index;
+#else   // HWY_COMPILER_MSVC
+  return static_cast<size_t>(__builtin_clz(x));
+#endif  // HWY_COMPILER_MSVC
+}
+
+HWY_API size_t Num0BitsAboveMS1Bit_Nonzero64(const uint64_t x) {
+#if HWY_COMPILER_MSVC
+#if HWY_ARCH_X86_64
+  unsigned long index;  // NOLINT
+  _BitScanReverse64(&index, x);
+  return 63 - index;
+#else   // HWY_ARCH_X86_64
+  // _BitScanReverse64 not available
+  const uint32_t msb = static_cast<uint32_t>(x >> 32u);
+  unsigned long index;  // NOLINT
+  if (msb == 0) {
+    const uint32_t lsb = static_cast<uint32_t>(x & 0xFFFFFFFF);
+    _BitScanReverse(&index, lsb);
+    return 63 - index;
+  } else {
+    _BitScanReverse(&index, msb);
+    return 31 - index;
+  }
+#endif  // HWY_ARCH_X86_64
+#else   // HWY_COMPILER_MSVC
+  return static_cast<size_t>(__builtin_clzll(x));
+#endif  // HWY_COMPILER_MSVC
+}
+
+HWY_API size_t PopCount(uint64_t x) {
+#if HWY_COMPILER_GCC  // includes clang
+  return static_cast<size_t>(__builtin_popcountll(x));
+  // This instruction has a separate feature flag, but is often called from
+  // non-SIMD code, so we don't want to require dynamic dispatch. It was first
+  // supported by Intel in Nehalem (SSE4.2), but MSVC only predefines a macro
+  // for AVX, so check for that.
+#elif HWY_COMPILER_MSVC && HWY_ARCH_X86_64 && defined(__AVX__)
+  return _mm_popcnt_u64(x);
+#elif HWY_COMPILER_MSVC && HWY_ARCH_X86_32 && defined(__AVX__)
+  return _mm_popcnt_u32(static_cast<uint32_t>(x & 0xFFFFFFFFu)) +
+         _mm_popcnt_u32(static_cast<uint32_t>(x >> 32));
+#else
+  x -= ((x >> 1) & 0x5555555555555555ULL);
+  x = (((x >> 2) & 0x3333333333333333ULL) + (x & 0x3333333333333333ULL));
+  x = (((x >> 4) + x) & 0x0F0F0F0F0F0F0F0FULL);
+  x += (x >> 8);
+  x += (x >> 16);
+  x += (x >> 32);
+  return static_cast<size_t>(x & 0x7Fu);
+#endif
+}
+
+// Skip HWY_API due to GCC "function not considered for inlining". Previously
+// such errors were caused by underlying type mismatches, but it's not clear
+// what is still mismatched despite all the casts.
+template <typename TI>
+/*HWY_API*/ constexpr size_t FloorLog2(TI x) {
+  return x == TI{1}
+             ? 0
+             : static_cast<size_t>(FloorLog2(static_cast<TI>(x >> 1)) + 1);
+}
+
+template <typename TI>
+/*HWY_API*/ constexpr size_t CeilLog2(TI x) {
+  return x == TI{1}
+             ? 0
+             : static_cast<size_t>(FloorLog2(static_cast<TI>(x - 1)) + 1);
+}
+
+#if HWY_COMPILER_MSVC && HWY_ARCH_X86_64
+#pragma intrinsic(_umul128)
+#endif
+
+// 64 x 64 = 128 bit multiplication
+HWY_API uint64_t Mul128(uint64_t a, uint64_t b, uint64_t* HWY_RESTRICT upper) {
+#if defined(__SIZEOF_INT128__)
+  __uint128_t product = (__uint128_t)a * (__uint128_t)b;
+  *upper = (uint64_t)(product >> 64);
+  return (uint64_t)(product & 0xFFFFFFFFFFFFFFFFULL);
+#elif HWY_COMPILER_MSVC && HWY_ARCH_X86_64
+  return _umul128(a, b, upper);
+#else
+  constexpr uint64_t kLo32 = 0xFFFFFFFFU;
+  const uint64_t lo_lo = (a & kLo32) * (b & kLo32);
+  const uint64_t hi_lo = (a >> 32) * (b & kLo32);
+  const uint64_t lo_hi = (a & kLo32) * (b >> 32);
+  const uint64_t hi_hi = (a >> 32) * (b >> 32);
+  const uint64_t t = (lo_lo >> 32) + (hi_lo & kLo32) + lo_hi;
+  *upper = (hi_lo >> 32) + (t >> 32) + hi_hi;
+  return (t << 32) | (lo_lo & kLo32);
+#endif
+}
+
+#if HWY_COMPILER_MSVC
+#pragma intrinsic(memcpy)
+#pragma intrinsic(memset)
+#endif
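
To make the fallback paths above easier to verify at a glance, here is an illustrative sketch (not part of the imported sources) exercising Mul128 and the log helpers:

  #include <stdio.h>

  #include "hwy/base.h"

  int main() {
    uint64_t hi;
    // (2^64 - 1) * 2 == 2^65 - 2, so hi == 1 and lo == 0xFFFFFFFFFFFFFFFE.
    const uint64_t lo = hwy::Mul128(0xFFFFFFFFFFFFFFFFull, 2u, &hi);
    printf("hi=%016llx lo=%016llx\n", static_cast<unsigned long long>(hi),
           static_cast<unsigned long long>(lo));

    static_assert(hwy::FloorLog2(8) == 3, "exact power of two");
    static_assert(hwy::CeilLog2(5) == 3, "rounds up: 2^3 = 8 >= 5");
    static_assert(hwy::CeilLog2(8) == 3, "exact values do not round up");
    return 0;
  }

+// The source/destination must not overlap/alias.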
+template +HWY_API void CopyBytes(const From* from, To* to) { +#if HWY_COMPILER_MSVC + memcpy(to, from, kBytes); +#else + __builtin_memcpy( + static_cast(to), static_cast(from), kBytes); +#endif +} + +// Same as CopyBytes, but for same-sized objects; avoids a size argument. +template +HWY_API void CopySameSize(const From* HWY_RESTRICT from, To* HWY_RESTRICT to) { + static_assert(sizeof(From) == sizeof(To), ""); + CopyBytes(from, to); +} + +template +HWY_API void ZeroBytes(To* to) { +#if HWY_COMPILER_MSVC + memset(to, 0, kBytes); +#else + __builtin_memset(to, 0, kBytes); +#endif +} + +HWY_API float F32FromBF16(bfloat16_t bf) { + uint32_t bits = bf.bits; + bits <<= 16; + float f; + CopySameSize(&bits, &f); + return f; +} + +HWY_API bfloat16_t BF16FromF32(float f) { + uint32_t bits; + CopySameSize(&f, &bits); + bfloat16_t bf; + bf.bits = static_cast(bits >> 16); + return bf; +} + +HWY_DLLEXPORT HWY_NORETURN void HWY_FORMAT(3, 4) + Abort(const char* file, int line, const char* format, ...); + +} // namespace hwy + +#endif // HIGHWAY_HWY_BASE_H_ diff --git a/hwy/base_test.cc b/hwy/base_test.cc new file mode 100644 index 0000000..baca70b --- /dev/null +++ b/hwy/base_test.cc @@ -0,0 +1,178 @@ +// Copyright 2019 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
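
Regarding the bfloat16 helpers that conclude base.h above: bfloat16 is the upper half of an IEEE binary32, so BF16FromF32 truncates the low 16 mantissa bits and F32FromBF16 merely shifts them back in. A small illustrative sketch, not part of the imported sources:

  #include <stdio.h>

  #include "hwy/base.h"

  int main() {
    const float f = 1.0078125f;  // 1 + 2^-7 fits in bfloat16's 7 mantissa bits
    const hwy::bfloat16_t bf = hwy::BF16FromF32(f);
    printf("%.7f\n", hwy::F32FromBF16(bf));  // prints 1.0078125 (exact)
    // Values needing more than 7 explicit mantissa bits come back truncated;
    // e.g. 1 + 2^-8 round-trips to 1.0f.
    return 0;
  }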
+ +#include +#include + +#include + +#include "hwy/base.h" + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "base_test.cc" +#include "hwy/foreach_target.h" // IWYU pragma: keep +#include "hwy/highway.h" +#include "hwy/tests/test_util-inl.h" + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { + +HWY_NOINLINE void TestAllLimits() { + HWY_ASSERT_EQ(uint8_t{0}, LimitsMin()); + HWY_ASSERT_EQ(uint16_t{0}, LimitsMin()); + HWY_ASSERT_EQ(uint32_t{0}, LimitsMin()); + HWY_ASSERT_EQ(uint64_t{0}, LimitsMin()); + + HWY_ASSERT_EQ(int8_t{-128}, LimitsMin()); + HWY_ASSERT_EQ(int16_t{-32768}, LimitsMin()); + HWY_ASSERT_EQ(static_cast(0x80000000u), LimitsMin()); + HWY_ASSERT_EQ(static_cast(0x8000000000000000ull), + LimitsMin()); + + HWY_ASSERT_EQ(uint8_t{0xFF}, LimitsMax()); + HWY_ASSERT_EQ(uint16_t{0xFFFF}, LimitsMax()); + HWY_ASSERT_EQ(uint32_t{0xFFFFFFFFu}, LimitsMax()); + HWY_ASSERT_EQ(uint64_t{0xFFFFFFFFFFFFFFFFull}, LimitsMax()); + + HWY_ASSERT_EQ(int8_t{0x7F}, LimitsMax()); + HWY_ASSERT_EQ(int16_t{0x7FFF}, LimitsMax()); + HWY_ASSERT_EQ(int32_t{0x7FFFFFFFu}, LimitsMax()); + HWY_ASSERT_EQ(int64_t{0x7FFFFFFFFFFFFFFFull}, LimitsMax()); +} + +struct TestLowestHighest { + template + HWY_NOINLINE void operator()(T /*unused*/) const { + HWY_ASSERT_EQ(std::numeric_limits::lowest(), LowestValue()); + HWY_ASSERT_EQ(std::numeric_limits::max(), HighestValue()); + } +}; + +HWY_NOINLINE void TestAllLowestHighest() { ForAllTypes(TestLowestHighest()); } +struct TestIsUnsigned { + template + HWY_NOINLINE void operator()(T /*unused*/) const { + static_assert(!IsFloat(), "Expected !IsFloat"); + static_assert(!IsSigned(), "Expected !IsSigned"); + } +}; + +struct TestIsSigned { + template + HWY_NOINLINE void operator()(T /*unused*/) const { + static_assert(!IsFloat(), "Expected !IsFloat"); + static_assert(IsSigned(), "Expected IsSigned"); + } +}; + +struct TestIsFloat { + template + HWY_NOINLINE void operator()(T /*unused*/) const { + static_assert(IsFloat(), "Expected IsFloat"); + static_assert(IsSigned(), "Floats are also considered signed"); + } +}; + +HWY_NOINLINE void TestAllType() { + ForUnsignedTypes(TestIsUnsigned()); + ForSignedTypes(TestIsSigned()); + ForFloatTypes(TestIsFloat()); + + static_assert(sizeof(MakeUnsigned) == 16, ""); + static_assert(sizeof(MakeWide) == 16, "Expected uint128_t"); + static_assert(sizeof(MakeNarrow) == 8, "Expected uint64_t"); +} + +struct TestIsSame { + template + HWY_NOINLINE void operator()(T /*unused*/) const { + static_assert(IsSame(), "T == T"); + static_assert(!IsSame, MakeUnsigned>(), "S != U"); + static_assert(!IsSame, MakeSigned>(), "U != S"); + } +}; + +HWY_NOINLINE void TestAllIsSame() { ForAllTypes(TestIsSame()); } + +HWY_NOINLINE void TestAllBitScan() { + HWY_ASSERT_EQ(size_t{0}, Num0BitsAboveMS1Bit_Nonzero32(0x80000000u)); + HWY_ASSERT_EQ(size_t{0}, Num0BitsAboveMS1Bit_Nonzero32(0xFFFFFFFFu)); + HWY_ASSERT_EQ(size_t{1}, Num0BitsAboveMS1Bit_Nonzero32(0x40000000u)); + HWY_ASSERT_EQ(size_t{1}, Num0BitsAboveMS1Bit_Nonzero32(0x40108210u)); + HWY_ASSERT_EQ(size_t{30}, Num0BitsAboveMS1Bit_Nonzero32(2u)); + HWY_ASSERT_EQ(size_t{30}, Num0BitsAboveMS1Bit_Nonzero32(3u)); + HWY_ASSERT_EQ(size_t{31}, Num0BitsAboveMS1Bit_Nonzero32(1u)); + + HWY_ASSERT_EQ(size_t{0}, + Num0BitsAboveMS1Bit_Nonzero64(0x8000000000000000ull)); + HWY_ASSERT_EQ(size_t{0}, + Num0BitsAboveMS1Bit_Nonzero64(0xFFFFFFFFFFFFFFFFull)); + HWY_ASSERT_EQ(size_t{1}, + Num0BitsAboveMS1Bit_Nonzero64(0x4000000000000000ull)); + HWY_ASSERT_EQ(size_t{1}, + 
Num0BitsAboveMS1Bit_Nonzero64(0x4010821004200011ull)); + HWY_ASSERT_EQ(size_t{62}, Num0BitsAboveMS1Bit_Nonzero64(2ull)); + HWY_ASSERT_EQ(size_t{62}, Num0BitsAboveMS1Bit_Nonzero64(3ull)); + HWY_ASSERT_EQ(size_t{63}, Num0BitsAboveMS1Bit_Nonzero64(1ull)); + + HWY_ASSERT_EQ(size_t{0}, Num0BitsBelowLS1Bit_Nonzero32(1u)); + HWY_ASSERT_EQ(size_t{1}, Num0BitsBelowLS1Bit_Nonzero32(2u)); + HWY_ASSERT_EQ(size_t{30}, Num0BitsBelowLS1Bit_Nonzero32(0xC0000000u)); + HWY_ASSERT_EQ(size_t{31}, Num0BitsBelowLS1Bit_Nonzero32(0x80000000u)); + + HWY_ASSERT_EQ(size_t{0}, Num0BitsBelowLS1Bit_Nonzero64(1ull)); + HWY_ASSERT_EQ(size_t{1}, Num0BitsBelowLS1Bit_Nonzero64(2ull)); + HWY_ASSERT_EQ(size_t{62}, + Num0BitsBelowLS1Bit_Nonzero64(0xC000000000000000ull)); + HWY_ASSERT_EQ(size_t{63}, + Num0BitsBelowLS1Bit_Nonzero64(0x8000000000000000ull)); +} + +HWY_NOINLINE void TestAllPopCount() { + HWY_ASSERT_EQ(size_t{0}, PopCount(0u)); + HWY_ASSERT_EQ(size_t{1}, PopCount(1u)); + HWY_ASSERT_EQ(size_t{1}, PopCount(2u)); + HWY_ASSERT_EQ(size_t{2}, PopCount(3u)); + HWY_ASSERT_EQ(size_t{1}, PopCount(0x80000000u)); + HWY_ASSERT_EQ(size_t{31}, PopCount(0x7FFFFFFFu)); + HWY_ASSERT_EQ(size_t{32}, PopCount(0xFFFFFFFFu)); + + HWY_ASSERT_EQ(size_t{1}, PopCount(0x80000000ull)); + HWY_ASSERT_EQ(size_t{31}, PopCount(0x7FFFFFFFull)); + HWY_ASSERT_EQ(size_t{32}, PopCount(0xFFFFFFFFull)); + HWY_ASSERT_EQ(size_t{33}, PopCount(0x10FFFFFFFFull)); + HWY_ASSERT_EQ(size_t{63}, PopCount(0xFFFEFFFFFFFFFFFFull)); + HWY_ASSERT_EQ(size_t{64}, PopCount(0xFFFFFFFFFFFFFFFFull)); +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE + +namespace hwy { +HWY_BEFORE_TEST(BaseTest); +HWY_EXPORT_AND_TEST_P(BaseTest, TestAllLimits); +HWY_EXPORT_AND_TEST_P(BaseTest, TestAllLowestHighest); +HWY_EXPORT_AND_TEST_P(BaseTest, TestAllType); +HWY_EXPORT_AND_TEST_P(BaseTest, TestAllIsSame); +HWY_EXPORT_AND_TEST_P(BaseTest, TestAllBitScan); +HWY_EXPORT_AND_TEST_P(BaseTest, TestAllPopCount); +} // namespace hwy + +#endif diff --git a/hwy/cache_control.h b/hwy/cache_control.h new file mode 100644 index 0000000..b124e57 --- /dev/null +++ b/hwy/cache_control.h @@ -0,0 +1,110 @@ +// Copyright 2020 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef HIGHWAY_HWY_CACHE_CONTROL_H_ +#define HIGHWAY_HWY_CACHE_CONTROL_H_ + +#include +#include + +#include "hwy/base.h" + +// Requires SSE2; fails to compile on 32-bit Clang 7 (see +// https://github.com/gperftools/gperftools/issues/946). +#if !defined(__SSE2__) || (HWY_COMPILER_CLANG && HWY_ARCH_X86_32) +#undef HWY_DISABLE_CACHE_CONTROL +#define HWY_DISABLE_CACHE_CONTROL +#endif + +// intrin.h is sufficient on MSVC and already included by base.h. +#if HWY_ARCH_X86 && !defined(HWY_DISABLE_CACHE_CONTROL) && !HWY_COMPILER_MSVC +#include // SSE2 +#endif + +// Windows.h #defines these, which causes infinite recursion. 
Temporarily +// undefine them in this header; these functions are anyway deprecated. +// TODO(janwas): remove when these functions are removed. +#pragma push_macro("LoadFence") +#undef LoadFence + +namespace hwy { + +// Even if N*sizeof(T) is smaller, Stream may write a multiple of this size. +#define HWY_STREAM_MULTIPLE 16 + +// The following functions may also require an attribute. +#if HWY_ARCH_X86 && !defined(HWY_DISABLE_CACHE_CONTROL) && !HWY_COMPILER_MSVC +#define HWY_ATTR_CACHE __attribute__((target("sse2"))) +#else +#define HWY_ATTR_CACHE +#endif + +// Delays subsequent loads until prior loads are visible. Beware of potentially +// differing behavior across architectures and vendors: on Intel but not +// AMD CPUs, also serves as a full fence (waits for all prior instructions to +// complete). +HWY_INLINE HWY_ATTR_CACHE void LoadFence() { +#if HWY_ARCH_X86 && !defined(HWY_DISABLE_CACHE_CONTROL) + _mm_lfence(); +#endif +} + +// Ensures values written by previous `Stream` calls are visible on the current +// core. This is NOT sufficient for synchronizing across cores; when `Stream` +// outputs are to be consumed by other core(s), the producer must publish +// availability (e.g. via mutex or atomic_flag) after `FlushStream`. +HWY_INLINE HWY_ATTR_CACHE void FlushStream() { +#if HWY_ARCH_X86 && !defined(HWY_DISABLE_CACHE_CONTROL) + _mm_sfence(); +#endif +} + +// Optionally begins loading the cache line containing "p" to reduce latency of +// subsequent actual loads. +template +HWY_INLINE HWY_ATTR_CACHE void Prefetch(const T* p) { +#if HWY_ARCH_X86 && !defined(HWY_DISABLE_CACHE_CONTROL) + _mm_prefetch(reinterpret_cast(p), _MM_HINT_T0); +#elif HWY_COMPILER_GCC // includes clang + // Hint=0 (NTA) behavior differs, but skipping outer caches is probably not + // desirable, so use the default 3 (keep in caches). + __builtin_prefetch(p, /*write=*/0, /*hint=*/3); +#else + (void)p; +#endif +} + +// Invalidates and flushes the cache line containing "p", if possible. +HWY_INLINE HWY_ATTR_CACHE void FlushCacheline(const void* p) { +#if HWY_ARCH_X86 && !defined(HWY_DISABLE_CACHE_CONTROL) + _mm_clflush(p); +#else + (void)p; +#endif +} + +// When called inside a spin-loop, may reduce power consumption. +HWY_INLINE HWY_ATTR_CACHE void Pause() { +#if HWY_ARCH_X86 && !defined(HWY_DISABLE_CACHE_CONTROL) + _mm_pause(); +#endif +} + +} // namespace hwy + +// TODO(janwas): remove when these functions are removed. (See above.) +#pragma pop_macro("LoadFence") + +#endif // HIGHWAY_HWY_CACHE_CONTROL_H_ diff --git a/hwy/contrib/algo/copy-inl.h b/hwy/contrib/algo/copy-inl.h new file mode 100644 index 0000000..033cf8a --- /dev/null +++ b/hwy/contrib/algo/copy-inl.h @@ -0,0 +1,136 @@ +// Copyright 2022 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
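
A usage note on the cache-control helpers above: a producer that fills memory with Stream (the per-target non-temporal store, defined elsewhere) is expected to call FlushStream before publishing availability, and Pause belongs in the consumer's spin loop. An illustrative sketch under those assumptions; the `ready` flag and the producer/consumer split are hypothetical:

  #include <atomic>

  #include "hwy/cache_control.h"

  std::atomic<bool> ready{false};  // hypothetical flag shared between cores

  void Producer() {
    // ... fill a buffer via Stream() in SIMD code (not shown) ...
    hwy::FlushStream();  // ensure the non-temporal stores are visible
    ready.store(true, std::memory_order_release);
  }

  void Consumer() {
    while (!ready.load(std::memory_order_acquire)) {
      hwy::Pause();  // reduce power and contention while spinning
    }
    // ... buffer contents are now safe to read ...
  }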
+ +// Per-target include guard +#if defined(HIGHWAY_HWY_CONTRIB_ALGO_COPY_INL_H_) == \ + defined(HWY_TARGET_TOGGLE) +#ifdef HIGHWAY_HWY_CONTRIB_ALGO_COPY_INL_H_ +#undef HIGHWAY_HWY_CONTRIB_ALGO_COPY_INL_H_ +#else +#define HIGHWAY_HWY_CONTRIB_ALGO_COPY_INL_H_ +#endif + +#include "hwy/highway.h" + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { + +// These functions avoid having to write a loop plus remainder handling in the +// (unfortunately still common) case where arrays are not aligned/padded. If the +// inputs are known to be aligned/padded, it is more efficient to write a single +// loop using Load(). We do not provide a CopyAlignedPadded because it +// would be more verbose than such a loop. + +// Fills `to`[0, `count`) with `value`. +template > +void Fill(D d, T value, size_t count, T* HWY_RESTRICT to) { + const size_t N = Lanes(d); + const Vec v = Set(d, value); + + size_t idx = 0; + for (; idx + N <= count; idx += N) { + StoreU(v, d, to + idx); + } + + // `count` was a multiple of the vector length `N`: already done. + if (HWY_UNLIKELY(idx == count)) return; + + const size_t remaining = count - idx; + HWY_DASSERT(0 != remaining && remaining < N); + SafeFillN(remaining, value, d, to + idx); +} + +// Copies `from`[0, `count`) to `to`, which must not overlap `from`. +template > +void Copy(D d, const T* HWY_RESTRICT from, size_t count, T* HWY_RESTRICT to) { + const size_t N = Lanes(d); + + size_t idx = 0; + for (; idx + N <= count; idx += N) { + const Vec v = LoadU(d, from + idx); + StoreU(v, d, to + idx); + } + + // `count` was a multiple of the vector length `N`: already done. + if (HWY_UNLIKELY(idx == count)) return; + + const size_t remaining = count - idx; + HWY_DASSERT(0 != remaining && remaining < N); + SafeCopyN(remaining, d, from + idx, to + idx); +} + +// For idx in [0, count) in ascending order, appends `from[idx]` to `to` if the +// corresponding mask element of `func(d, v)` is true. Returns the STL-style end +// of the newly written elements in `to`. +// +// `func` is either a functor with a templated operator()(d, v) returning a +// mask, or a generic lambda if using C++14. Due to apparent limitations of +// Clang on Windows, it is currently necessary to add HWY_ATTR before the +// opening { of the lambda to avoid errors about "function .. requires target". +// +// NOTE: this is only supported for 16-, 32- or 64-bit types. +// NOTE: Func may be called a second time for elements it has already seen, but +// these elements will not be written to `to` again. +template > +T* CopyIf(D d, const T* HWY_RESTRICT from, size_t count, T* HWY_RESTRICT to, + const Func& func) { + const size_t N = Lanes(d); + + size_t idx = 0; + for (; idx + N <= count; idx += N) { + const Vec v = LoadU(d, from + idx); + to += CompressBlendedStore(v, func(d, v), d, to); + } + + // `count` was a multiple of the vector length `N`: already done. + if (HWY_UNLIKELY(idx == count)) return to; + +#if HWY_MEM_OPS_MIGHT_FAULT + // Proceed one by one. + const CappedTag d1; + for (; idx < count; ++idx) { + using V1 = Vec; + // Workaround for -Waggressive-loop-optimizations on GCC 8 + // (iteration 2305843009213693951 invokes undefined behavior for T=i64) + const uintptr_t addr = reinterpret_cast(from); + const T* HWY_RESTRICT from_idx = + reinterpret_cast(addr + (idx * sizeof(T))); + const V1 v = LoadU(d1, from_idx); + // Avoid storing to `to` unless we know it should be kept - otherwise, we + // might overrun the end if it was allocated for the exact count. 
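+    // (The vector loop above is safe because CompressBlendedStore writes only
+    // the lanes that are kept; here a full single-lane StoreU would write
+    // unconditionally, hence the CountTrue check first.)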
+ if (CountTrue(d1, func(d1, v)) == 0) continue; + StoreU(v, d1, to); + to += 1; + } +#else + // Start index of the last unaligned whole vector, ending at the array end. + const size_t last = count - N; + // Number of elements before `from` or already written. + const size_t invalid = idx - last; + HWY_DASSERT(0 != invalid && invalid < N); + const Mask mask = Not(FirstN(d, invalid)); + const Vec v = MaskedLoad(mask, d, from + last); + to += CompressBlendedStore(v, And(mask, func(d, v)), d, to); +#endif + return to; +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +#endif // HIGHWAY_HWY_CONTRIB_ALGO_COPY_INL_H_ diff --git a/hwy/contrib/algo/copy_test.cc b/hwy/contrib/algo/copy_test.cc new file mode 100644 index 0000000..e2675a3 --- /dev/null +++ b/hwy/contrib/algo/copy_test.cc @@ -0,0 +1,199 @@ +// Copyright 2022 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "hwy/aligned_allocator.h" + +// clang-format off +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "hwy/contrib/algo/copy_test.cc" +#include "hwy/foreach_target.h" // IWYU pragma: keep + +#include "hwy/contrib/algo/copy-inl.h" +#include "hwy/tests/test_util-inl.h" +// clang-format on + +// If your project requires C++14 or later, you can ignore this and pass lambdas +// directly to Transform, without requiring an lvalue as we do here for C++11. +#if __cplusplus < 201402L +#define HWY_GENERIC_LAMBDA 0 +#else +#define HWY_GENERIC_LAMBDA 1 +#endif + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { + +// Returns random integer in [0, 128), which fits in any lane type. +template +T Random7Bit(RandomState& rng) { + return static_cast(Random32(&rng) & 127); +} + +// In C++14, we can instead define these as generic lambdas next to where they +// are invoked. +#if !HWY_GENERIC_LAMBDA + +struct IsOdd { + template + Mask operator()(D d, V v) const { + return TestBit(v, Set(d, TFromD{1})); + } +}; + +#endif // !HWY_GENERIC_LAMBDA + +// Invokes Test (e.g. TestCopyIf) with all arg combinations. T comes from +// ForFloatTypes. +template +struct ForeachCountAndMisalign { + template + HWY_NOINLINE void operator()(T /*unused*/, D d) const { + RandomState rng; + const size_t N = Lanes(d); + const size_t misalignments[3] = {0, N / 4, 3 * N / 5}; + + for (size_t count = 0; count < 2 * N; ++count) { + for (size_t ma : misalignments) { + for (size_t mb : misalignments) { + Test()(d, count, ma, mb, rng); + } + } + } + } +}; + +struct TestFill { + template + void operator()(D d, size_t count, size_t misalign_a, size_t misalign_b, + RandomState& rng) { + using T = TFromD; + // HWY_MAX prevents error when misalign == count == 0. 
+ AlignedFreeUniquePtr pa = + AllocateAligned(HWY_MAX(1, misalign_a + count)); + T* expected = pa.get() + misalign_a; + const T value = Random7Bit(rng); + for (size_t i = 0; i < count; ++i) { + expected[i] = value; + } + AlignedFreeUniquePtr pb = AllocateAligned(misalign_b + count + 1); + T* actual = pb.get() + misalign_b; + + actual[count] = T{0}; // sentinel + Fill(d, value, count, actual); + HWY_ASSERT_EQ(T{0}, actual[count]); // did not write past end + + const auto info = hwy::detail::MakeTypeInfo(); + const char* target_name = hwy::TargetName(HWY_TARGET); + hwy::detail::AssertArrayEqual(info, expected, actual, count, target_name, + __FILE__, __LINE__); + } +}; + +void TestAllFill() { + ForAllTypes(ForPartialVectors>()); +} + +struct TestCopy { + template + void operator()(D d, size_t count, size_t misalign_a, size_t misalign_b, + RandomState& rng) { + using T = TFromD; + // Prevents error if size to allocate is zero. + AlignedFreeUniquePtr pa = + AllocateAligned(HWY_MAX(1, misalign_a + count)); + T* a = pa.get() + misalign_a; + for (size_t i = 0; i < count; ++i) { + a[i] = Random7Bit(rng); + } + AlignedFreeUniquePtr pb = + AllocateAligned(HWY_MAX(1, misalign_b + count)); + T* b = pb.get() + misalign_b; + + Copy(d, a, count, b); + + const auto info = hwy::detail::MakeTypeInfo(); + const char* target_name = hwy::TargetName(HWY_TARGET); + hwy::detail::AssertArrayEqual(info, a, b, count, target_name, __FILE__, + __LINE__); + } +}; + +void TestAllCopy() { + ForAllTypes(ForPartialVectors>()); +} + +struct TestCopyIf { + template + void operator()(D d, size_t count, size_t misalign_a, size_t misalign_b, + RandomState& rng) { + using T = TFromD; + // Prevents error if size to allocate is zero. + AlignedFreeUniquePtr pa = + AllocateAligned(HWY_MAX(1, misalign_a + count)); + T* a = pa.get() + misalign_a; + for (size_t i = 0; i < count; ++i) { + a[i] = Random7Bit(rng); + } + const size_t padding = Lanes(ScalableTag()); + AlignedFreeUniquePtr pb = + AllocateAligned(HWY_MAX(1, misalign_b + count + padding)); + T* b = pb.get() + misalign_b; + + AlignedFreeUniquePtr expected = AllocateAligned(HWY_MAX(1, count)); + size_t num_odd = 0; + for (size_t i = 0; i < count; ++i) { + if (a[i] & 1) { + expected[num_odd++] = a[i]; + } + } + +#if HWY_GENERIC_LAMBDA + const auto is_odd = [](const auto d, const auto v) HWY_ATTR { + return TestBit(v, Set(d, TFromD{1})); + }; +#else + const IsOdd is_odd; +#endif + T* end = CopyIf(d, a, count, b, is_odd); + const size_t num_written = static_cast(end - b); + HWY_ASSERT_EQ(num_odd, num_written); + + const auto info = hwy::detail::MakeTypeInfo(); + const char* target_name = hwy::TargetName(HWY_TARGET); + hwy::detail::AssertArrayEqual(info, expected.get(), b, num_odd, target_name, + __FILE__, __LINE__); + } +}; + +void TestAllCopyIf() { + ForUI163264(ForPartialVectors>()); +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE + +namespace hwy { +HWY_BEFORE_TEST(CopyTest); +HWY_EXPORT_AND_TEST_P(CopyTest, TestAllFill); +HWY_EXPORT_AND_TEST_P(CopyTest, TestAllCopy); +HWY_EXPORT_AND_TEST_P(CopyTest, TestAllCopyIf); +} // namespace hwy + +#endif diff --git a/hwy/contrib/algo/find-inl.h b/hwy/contrib/algo/find-inl.h new file mode 100644 index 0000000..388842e --- /dev/null +++ b/hwy/contrib/algo/find-inl.h @@ -0,0 +1,109 @@ +// Copyright 2022 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may 
not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Per-target include guard +#if defined(HIGHWAY_HWY_CONTRIB_ALGO_FIND_INL_H_) == \ + defined(HWY_TARGET_TOGGLE) +#ifdef HIGHWAY_HWY_CONTRIB_ALGO_FIND_INL_H_ +#undef HIGHWAY_HWY_CONTRIB_ALGO_FIND_INL_H_ +#else +#define HIGHWAY_HWY_CONTRIB_ALGO_FIND_INL_H_ +#endif + +#include "hwy/highway.h" + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { + +// Returns index of the first element equal to `value` in `in[0, count)`, or +// `count` if not found. +template > +size_t Find(D d, T value, const T* HWY_RESTRICT in, size_t count) { + const size_t N = Lanes(d); + const Vec broadcasted = Set(d, value); + + size_t i = 0; + for (; i + N <= count; i += N) { + const intptr_t pos = FindFirstTrue(d, Eq(broadcasted, LoadU(d, in + i))); + if (pos >= 0) return i + static_cast(pos); + } + + if (i != count) { +#if HWY_MEM_OPS_MIGHT_FAULT + // Scan single elements. + const CappedTag d1; + using V1 = Vec; + const V1 broadcasted1 = Set(d1, GetLane(broadcasted)); + for (; i < count; ++i) { + if (AllTrue(d1, Eq(broadcasted1, LoadU(d1, in + i)))) { + return i; + } + } +#else + const size_t remaining = count - i; + HWY_DASSERT(0 != remaining && remaining < N); + const Mask mask = FirstN(d, remaining); + const Vec v = MaskedLoad(mask, d, in + i); + // Apply mask so that we don't 'find' the zero-padding from MaskedLoad. + const intptr_t pos = FindFirstTrue(d, And(Eq(broadcasted, v), mask)); + if (pos >= 0) return i + static_cast(pos); +#endif // HWY_MEM_OPS_MIGHT_FAULT + } + + return count; // not found +} + +// Returns index of the first element in `in[0, count)` for which `func(d, vec)` +// returns true, otherwise `count`. +template > +size_t FindIf(D d, const T* HWY_RESTRICT in, size_t count, const Func& func) { + const size_t N = Lanes(d); + + size_t i = 0; + for (; i + N <= count; i += N) { + const intptr_t pos = FindFirstTrue(d, func(d, LoadU(d, in + i))); + if (pos >= 0) return i + static_cast(pos); + } + + if (i != count) { +#if HWY_MEM_OPS_MIGHT_FAULT + // Scan single elements. + const CappedTag d1; + for (; i < count; ++i) { + if (AllTrue(d1, func(d1, LoadU(d1, in + i)))) { + return i; + } + } +#else + const size_t remaining = count - i; + HWY_DASSERT(0 != remaining && remaining < N); + const Mask mask = FirstN(d, remaining); + const Vec v = MaskedLoad(mask, d, in + i); + // Apply mask so that we don't 'find' the zero-padding from MaskedLoad. 
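+    // (The predicate is arbitrary and may hold for the zero padding, e.g. a
+    // less-than-one comparison, so it must be And-ed with the valid-lane
+    // mask.)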
+ const intptr_t pos = FindFirstTrue(d, And(func(d, v), mask)); + if (pos >= 0) return i + static_cast(pos); +#endif // HWY_MEM_OPS_MIGHT_FAULT + } + + return count; // not found +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +#endif // HIGHWAY_HWY_CONTRIB_ALGO_FIND_INL_H_ diff --git a/hwy/contrib/algo/find_test.cc b/hwy/contrib/algo/find_test.cc new file mode 100644 index 0000000..da13c47 --- /dev/null +++ b/hwy/contrib/algo/find_test.cc @@ -0,0 +1,219 @@ +// Copyright 2022 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include + +#include "hwy/aligned_allocator.h" +#include "hwy/base.h" +#include "hwy/print.h" + +// clang-format off +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "hwy/contrib/algo/find_test.cc" +#include "hwy/foreach_target.h" // IWYU pragma: keep + +#include "hwy/contrib/algo/find-inl.h" +#include "hwy/tests/test_util-inl.h" +// clang-format on + +// If your project requires C++14 or later, you can ignore this and pass lambdas +// directly to FindIf, without requiring an lvalue as we do here for C++11. +#if __cplusplus < 201402L +#define HWY_GENERIC_LAMBDA 0 +#else +#define HWY_GENERIC_LAMBDA 1 +#endif + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { + +// Returns random number in [-8, 8) - we use knowledge of the range to Find() +// values we know are not present. +template +T Random(RandomState& rng) { + const int32_t bits = static_cast(Random32(&rng)) & 1023; + const double val = (bits - 512) / 64.0; + // Clamp negative to zero for unsigned types. + return static_cast(HWY_MAX(hwy::LowestValue(), val)); +} + +// In C++14, we can instead define these as generic lambdas next to where they +// are invoked. +#if !HWY_GENERIC_LAMBDA + +class GreaterThan { + public: + GreaterThan(int val) : val_(val) {} + template + Mask operator()(D d, V v) const { + return Gt(v, Set(d, static_cast>(val_))); + } + + private: + int val_; +}; + +#endif // !HWY_GENERIC_LAMBDA + +// Invokes Test (e.g. TestFind) with all arg combinations. +template +struct ForeachCountAndMisalign { + template + HWY_NOINLINE void operator()(T /*unused*/, D d) const { + RandomState rng; + const size_t N = Lanes(d); + const size_t misalignments[3] = {0, N / 4, 3 * N / 5}; + + // Find() checks 8 vectors at a time, so we want to cover a fairly large + // range without oversampling (checking every possible count). + std::vector counts(AdjustedReps(512)); + for (size_t& count : counts) { + count = static_cast(rng()) % (16 * N + 1); + } + counts[0] = 0; // ensure we test count=0. + + for (size_t count : counts) { + for (size_t m : misalignments) { + Test()(d, count, m, rng); + } + } + } +}; + +struct TestFind { + template + void operator()(D d, size_t count, size_t misalign, RandomState& rng) { + using T = TFromD; + // Must allocate at least one even if count is zero. 
+ AlignedFreeUniquePtr storage = + AllocateAligned(HWY_MAX(1, misalign + count)); + T* in = storage.get() + misalign; + for (size_t i = 0; i < count; ++i) { + in[i] = Random(rng); + } + + // For each position, search for that element (which we know is there) + for (size_t pos = 0; pos < count; ++pos) { + const size_t actual = Find(d, in[pos], in, count); + + // We may have found an earlier occurrence of the same value; ensure the + // value is the same, and that it is the first. + if (!IsEqual(in[pos], in[actual])) { + fprintf(stderr, "%s count %d, found %.15f at %d but wanted %.15f\n", + hwy::TypeName(T(), Lanes(d)).c_str(), static_cast(count), + static_cast(in[actual]), static_cast(actual), + static_cast(in[pos])); + HWY_ASSERT(false); + } + for (size_t i = 0; i < actual; ++i) { + if (IsEqual(in[i], in[pos])) { + fprintf(stderr, "%s count %d, found %f at %d but Find returned %d\n", + hwy::TypeName(T(), Lanes(d)).c_str(), static_cast(count), + static_cast(in[i]), static_cast(i), + static_cast(actual)); + HWY_ASSERT(false); + } + } + } + + // Also search for values we know not to be present (out of range) + HWY_ASSERT_EQ(count, Find(d, T{9}, in, count)); + HWY_ASSERT_EQ(count, Find(d, static_cast(-9), in, count)); + } +}; + +void TestAllFind() { + ForAllTypes(ForPartialVectors>()); +} + +struct TestFindIf { + template + void operator()(D d, size_t count, size_t misalign, RandomState& rng) { + using T = TFromD; + using TI = MakeSigned; + // Must allocate at least one even if count is zero. + AlignedFreeUniquePtr storage = + AllocateAligned(HWY_MAX(1, misalign + count)); + T* in = storage.get() + misalign; + for (size_t i = 0; i < count; ++i) { + in[i] = Random(rng); + HWY_ASSERT(in[i] < 8); + HWY_ASSERT(!hwy::IsSigned() || static_cast(in[i]) >= -8); + } + + bool found_any = false; + bool not_found_any = false; + + // unsigned T would be promoted to signed and compare greater than any + // negative val, whereas Set() would just cast to an unsigned value and the + // comparison remains unsigned, so avoid negative numbers there. + const int min_val = IsSigned() ? -9 : 0; + // Includes out-of-range value 9 to test the not-found path. + for (int val = min_val; val <= 9; ++val) { +#if HWY_GENERIC_LAMBDA + const auto greater = [val](const auto d, const auto v) HWY_ATTR { + return Gt(v, Set(d, static_cast(val))); + }; +#else + const GreaterThan greater(val); +#endif + const size_t actual = FindIf(d, in, count, greater); + found_any |= actual < count; + not_found_any |= actual == count; + + const auto pos = std::find_if( + in, in + count, [val](T x) { return x > static_cast(val); }); + // Convert returned iterator to index. + const size_t expected = static_cast(pos - in); + if (expected != actual) { + fprintf(stderr, "%s count %d val %d, expected %d actual %d\n", + hwy::TypeName(T(), Lanes(d)).c_str(), static_cast(count), + val, static_cast(expected), static_cast(actual)); + hwy::detail::PrintArray(hwy::detail::MakeTypeInfo(), "in", in, count, + 0, count); + HWY_ASSERT(false); + } + } + + // We will always not-find something due to val=9. + HWY_ASSERT(not_found_any); + // We'll find something unless the input is empty or {0} - because 0 > i + // is false for all i=[0,9]. 
+ if (count != 0 && in[0] != 0) { + HWY_ASSERT(found_any); + } + } +}; + +void TestAllFindIf() { + ForAllTypes(ForPartialVectors>()); +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE + +namespace hwy { +HWY_BEFORE_TEST(FindTest); +HWY_EXPORT_AND_TEST_P(FindTest, TestAllFind); +HWY_EXPORT_AND_TEST_P(FindTest, TestAllFindIf); +} // namespace hwy + +#endif diff --git a/hwy/contrib/algo/transform-inl.h b/hwy/contrib/algo/transform-inl.h new file mode 100644 index 0000000..3e830ac --- /dev/null +++ b/hwy/contrib/algo/transform-inl.h @@ -0,0 +1,262 @@ +// Copyright 2022 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Per-target include guard +#if defined(HIGHWAY_HWY_CONTRIB_ALGO_TRANSFORM_INL_H_) == \ + defined(HWY_TARGET_TOGGLE) +#ifdef HIGHWAY_HWY_CONTRIB_ALGO_TRANSFORM_INL_H_ +#undef HIGHWAY_HWY_CONTRIB_ALGO_TRANSFORM_INL_H_ +#else +#define HIGHWAY_HWY_CONTRIB_ALGO_TRANSFORM_INL_H_ +#endif + +#include "hwy/highway.h" + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { + +// These functions avoid having to write a loop plus remainder handling in the +// (unfortunately still common) case where arrays are not aligned/padded. If the +// inputs are known to be aligned/padded, it is more efficient to write a single +// loop using Load(). We do not provide a TransformAlignedPadded because it +// would be more verbose than such a loop. +// +// Func is either a functor with a templated operator()(d, v[, v1[, v2]]), or a +// generic lambda if using C++14. Due to apparent limitations of Clang on +// Windows, it is currently necessary to add HWY_ATTR before the opening { of +// the lambda to avoid errors about "always_inline function .. requires target". +// +// If HWY_MEM_OPS_MIGHT_FAULT, we use scalar code instead of masking. Otherwise, +// we used `MaskedLoad` and `BlendedStore` to read/write the final partial +// vector. + +// Fills `out[0, count)` with the vectors returned by `func(d, index_vec)`, +// where `index_vec` is `Vec>`. On the first call to `func`, +// the value of its lane i is i, and increases by `Lanes(d)` after every call. +// Note that some of these indices may be `>= count`, but the elements that +// `func` returns in those lanes will not be written to `out`. +template > +void Generate(D d, T* HWY_RESTRICT out, size_t count, const Func& func) { + const RebindToUnsigned du; + using TU = TFromD; + const size_t N = Lanes(d); + + size_t idx = 0; + Vec vidx = Iota(du, 0); + for (; idx + N <= count; idx += N) { + StoreU(func(d, vidx), d, out + idx); + vidx = Add(vidx, Set(du, static_cast(N))); + } + + // `count` was a multiple of the vector length `N`: already done. + if (HWY_UNLIKELY(idx == count)) return; + +#if HWY_MEM_OPS_MIGHT_FAULT + // Proceed one by one. 
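+  // (On targets where masked stores might fault, single-lane operations touch
+  // only valid memory; the cost is one iteration per remaining element
+  // instead of the single masked store below.)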
+ const CappedTag d1; + const RebindToUnsigned du1; + for (; idx < count; ++idx) { + StoreU(func(d1, Set(du1, static_cast(idx))), d1, out + idx); + } +#else + const size_t remaining = count - idx; + HWY_DASSERT(0 != remaining && remaining < N); + const Mask mask = FirstN(d, remaining); + BlendedStore(func(d, vidx), mask, d, out + idx); +#endif +} + +// Replaces `inout[idx]` with `func(d, inout[idx])`. Example usage: multiplying +// array elements by a constant. +template > +void Transform(D d, T* HWY_RESTRICT inout, size_t count, const Func& func) { + const size_t N = Lanes(d); + + size_t idx = 0; + for (; idx + N <= count; idx += N) { + const Vec v = LoadU(d, inout + idx); + StoreU(func(d, v), d, inout + idx); + } + + // `count` was a multiple of the vector length `N`: already done. + if (HWY_UNLIKELY(idx == count)) return; + +#if HWY_MEM_OPS_MIGHT_FAULT + // Proceed one by one. + const CappedTag d1; + for (; idx < count; ++idx) { + using V1 = Vec; + const V1 v = LoadU(d1, inout + idx); + StoreU(func(d1, v), d1, inout + idx); + } +#else + const size_t remaining = count - idx; + HWY_DASSERT(0 != remaining && remaining < N); + const Mask mask = FirstN(d, remaining); + const Vec v = MaskedLoad(mask, d, inout + idx); + BlendedStore(func(d, v), mask, d, inout + idx); +#endif +} + +// Replaces `inout[idx]` with `func(d, inout[idx], in1[idx])`. Example usage: +// multiplying array elements by those of another array. +template > +void Transform1(D d, T* HWY_RESTRICT inout, size_t count, + const T* HWY_RESTRICT in1, const Func& func) { + const size_t N = Lanes(d); + + size_t idx = 0; + for (; idx + N <= count; idx += N) { + const Vec v = LoadU(d, inout + idx); + const Vec v1 = LoadU(d, in1 + idx); + StoreU(func(d, v, v1), d, inout + idx); + } + + // `count` was a multiple of the vector length `N`: already done. + if (HWY_UNLIKELY(idx == count)) return; + +#if HWY_MEM_OPS_MIGHT_FAULT + // Proceed one by one. + const CappedTag d1; + for (; idx < count; ++idx) { + using V1 = Vec; + const V1 v = LoadU(d1, inout + idx); + const V1 v1 = LoadU(d1, in1 + idx); + StoreU(func(d1, v, v1), d1, inout + idx); + } +#else + const size_t remaining = count - idx; + HWY_DASSERT(0 != remaining && remaining < N); + const Mask mask = FirstN(d, remaining); + const Vec v = MaskedLoad(mask, d, inout + idx); + const Vec v1 = MaskedLoad(mask, d, in1 + idx); + BlendedStore(func(d, v, v1), mask, d, inout + idx); +#endif +} + +// Replaces `inout[idx]` with `func(d, inout[idx], in1[idx], in2[idx])`. Example +// usage: FMA of elements from three arrays, stored into the first array. +template > +void Transform2(D d, T* HWY_RESTRICT inout, size_t count, + const T* HWY_RESTRICT in1, const T* HWY_RESTRICT in2, + const Func& func) { + const size_t N = Lanes(d); + + size_t idx = 0; + for (; idx + N <= count; idx += N) { + const Vec v = LoadU(d, inout + idx); + const Vec v1 = LoadU(d, in1 + idx); + const Vec v2 = LoadU(d, in2 + idx); + StoreU(func(d, v, v1, v2), d, inout + idx); + } + + // `count` was a multiple of the vector length `N`: already done. + if (HWY_UNLIKELY(idx == count)) return; + +#if HWY_MEM_OPS_MIGHT_FAULT + // Proceed one by one. 
+ const CappedTag d1; + for (; idx < count; ++idx) { + using V1 = Vec; + const V1 v = LoadU(d1, inout + idx); + const V1 v1 = LoadU(d1, in1 + idx); + const V1 v2 = LoadU(d1, in2 + idx); + StoreU(func(d1, v, v1, v2), d1, inout + idx); + } +#else + const size_t remaining = count - idx; + HWY_DASSERT(0 != remaining && remaining < N); + const Mask mask = FirstN(d, remaining); + const Vec v = MaskedLoad(mask, d, inout + idx); + const Vec v1 = MaskedLoad(mask, d, in1 + idx); + const Vec v2 = MaskedLoad(mask, d, in2 + idx); + BlendedStore(func(d, v, v1, v2), mask, d, inout + idx); +#endif +} + +template > +void Replace(D d, T* HWY_RESTRICT inout, size_t count, T new_t, T old_t) { + const size_t N = Lanes(d); + const Vec old_v = Set(d, old_t); + const Vec new_v = Set(d, new_t); + + size_t idx = 0; + for (; idx + N <= count; idx += N) { + Vec v = LoadU(d, inout + idx); + StoreU(IfThenElse(Eq(v, old_v), new_v, v), d, inout + idx); + } + + // `count` was a multiple of the vector length `N`: already done. + if (HWY_UNLIKELY(idx == count)) return; + +#if HWY_MEM_OPS_MIGHT_FAULT + // Proceed one by one. + const CappedTag d1; + const Vec old_v1 = Set(d1, old_t); + const Vec new_v1 = Set(d1, new_t); + for (; idx < count; ++idx) { + using V1 = Vec; + const V1 v1 = LoadU(d1, inout + idx); + StoreU(IfThenElse(Eq(v1, old_v1), new_v1, v1), d1, inout + idx); + } +#else + const size_t remaining = count - idx; + HWY_DASSERT(0 != remaining && remaining < N); + const Mask mask = FirstN(d, remaining); + const Vec v = MaskedLoad(mask, d, inout + idx); + BlendedStore(IfThenElse(Eq(v, old_v), new_v, v), mask, d, inout + idx); +#endif +} + +template > +void ReplaceIf(D d, T* HWY_RESTRICT inout, size_t count, T new_t, + const Func& func) { + const size_t N = Lanes(d); + const Vec new_v = Set(d, new_t); + + size_t idx = 0; + for (; idx + N <= count; idx += N) { + Vec v = LoadU(d, inout + idx); + StoreU(IfThenElse(func(d, v), new_v, v), d, inout + idx); + } + + // `count` was a multiple of the vector length `N`: already done. + if (HWY_UNLIKELY(idx == count)) return; + +#if HWY_MEM_OPS_MIGHT_FAULT + // Proceed one by one. + const CappedTag d1; + const Vec new_v1 = Set(d1, new_t); + for (; idx < count; ++idx) { + using V1 = Vec; + const V1 v = LoadU(d1, inout + idx); + StoreU(IfThenElse(func(d1, v), new_v1, v), d1, inout + idx); + } +#else + const size_t remaining = count - idx; + HWY_DASSERT(0 != remaining && remaining < N); + const Mask mask = FirstN(d, remaining); + const Vec v = MaskedLoad(mask, d, inout + idx); + BlendedStore(IfThenElse(func(d, v), new_v, v), mask, d, inout + idx); +#endif +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +#endif // HIGHWAY_HWY_CONTRIB_ALGO_TRANSFORM_INL_H_ diff --git a/hwy/contrib/algo/transform_test.cc b/hwy/contrib/algo/transform_test.cc new file mode 100644 index 0000000..335607c --- /dev/null +++ b/hwy/contrib/algo/transform_test.cc @@ -0,0 +1,372 @@ +// Copyright 2022 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include // memcpy + +#include "hwy/aligned_allocator.h" + +// clang-format off +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "hwy/contrib/algo/transform_test.cc" //NOLINT +#include "hwy/foreach_target.h" // IWYU pragma: keep + +#include "hwy/contrib/algo/transform-inl.h" +#include "hwy/tests/test_util-inl.h" +// clang-format on + +// If your project requires C++14 or later, you can ignore this and pass lambdas +// directly to Transform, without requiring an lvalue as we do here for C++11. +#if __cplusplus < 201402L +#define HWY_GENERIC_LAMBDA 0 +#else +#define HWY_GENERIC_LAMBDA 1 +#endif + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { + +template +T Alpha() { + return static_cast(1.5); // arbitrary scalar +} + +// Returns random floating-point number in [-8, 8) to ensure computations do +// not exceed float32 precision. +template +T Random(RandomState& rng) { + const int32_t bits = static_cast(Random32(&rng)) & 1023; + const double val = (bits - 512) / 64.0; + // Clamp negative to zero for unsigned types. + return static_cast(HWY_MAX(hwy::LowestValue(), val)); +} + +// SCAL, AXPY names are from BLAS. +template +HWY_NOINLINE void SimpleSCAL(const T* x, T* out, size_t count) { + for (size_t i = 0; i < count; ++i) { + out[i] = Alpha() * x[i]; + } +} + +template +HWY_NOINLINE void SimpleAXPY(const T* x, const T* y, T* out, size_t count) { + for (size_t i = 0; i < count; ++i) { + out[i] = Alpha() * x[i] + y[i]; + } +} + +template +HWY_NOINLINE void SimpleFMA4(const T* x, const T* y, const T* z, T* out, + size_t count) { + for (size_t i = 0; i < count; ++i) { + out[i] = x[i] * y[i] + z[i]; + } +} + +// In C++14, we can instead define these as generic lambdas next to where they +// are invoked. +#if !HWY_GENERIC_LAMBDA + +// Generator that returns even numbers by doubling the output indices. +struct Gen2 { + template + Vec operator()(D d, VU vidx) const { + return BitCast(d, Add(vidx, vidx)); + } +}; + +struct SCAL { + template + Vec operator()(D d, V v) const { + using T = TFromD; + return Mul(Set(d, Alpha()), v); + } +}; + +struct AXPY { + template + Vec operator()(D d, V v, V v1) const { + using T = TFromD; + return MulAdd(Set(d, Alpha()), v, v1); + } +}; + +struct FMA4 { + template + Vec operator()(D /*d*/, V v, V v1, V v2) const { + return MulAdd(v, v1, v2); + } +}; + +#endif // !HWY_GENERIC_LAMBDA + +// Invokes Test (e.g. TestTransform1) with all arg combinations. T comes from +// ForFloatTypes. +template +struct ForeachCountAndMisalign { + template + HWY_NOINLINE void operator()(T /*unused*/, D d) const { + RandomState rng; + const size_t N = Lanes(d); + const size_t misalignments[3] = {0, N / 4, 3 * N / 5}; + + for (size_t count = 0; count < 2 * N; ++count) { + for (size_t ma : misalignments) { + for (size_t mb : misalignments) { + Test()(d, count, ma, mb, rng); + } + } + } + } +}; + +// Output-only, no loads +struct TestGenerate { + template + void operator()(D d, size_t count, size_t misalign_a, size_t /*misalign_b*/, + RandomState& /*rng*/) { + using T = TFromD; + AlignedFreeUniquePtr pa = AllocateAligned(misalign_a + count + 1); + T* actual = pa.get() + misalign_a; + + AlignedFreeUniquePtr expected = AllocateAligned(HWY_MAX(1, count)); + for (size_t i = 0; i < count; ++i) { + expected[i] = static_cast(2 * i); + } + + // TODO(janwas): can we update the apply_to in HWY_PUSH_ATTRIBUTES so that + // the attribute also applies to lambdas? 
If so, remove HWY_ATTR. +#if HWY_GENERIC_LAMBDA + const auto gen2 = [](const auto d, const auto vidx) + HWY_ATTR { return BitCast(d, Add(vidx, vidx)); }; +#else + const Gen2 gen2; +#endif + actual[count] = T{0}; // sentinel + Generate(d, actual, count, gen2); + HWY_ASSERT_EQ(T{0}, actual[count]); // did not write past end + + const auto info = hwy::detail::MakeTypeInfo(); + const char* target_name = hwy::TargetName(HWY_TARGET); + hwy::detail::AssertArrayEqual(info, expected.get(), actual, count, + target_name, __FILE__, __LINE__); + } +}; + +// Zero extra input arrays +struct TestTransform { + template + void operator()(D d, size_t count, size_t misalign_a, size_t misalign_b, + RandomState& rng) { + if (misalign_b != 0) return; + using T = TFromD; + // Prevents error if size to allocate is zero. + AlignedFreeUniquePtr pa = + AllocateAligned(HWY_MAX(1, misalign_a + count)); + T* a = pa.get() + misalign_a; + for (size_t i = 0; i < count; ++i) { + a[i] = Random(rng); + } + + AlignedFreeUniquePtr expected = AllocateAligned(HWY_MAX(1, count)); + SimpleSCAL(a, expected.get(), count); + + // TODO(janwas): can we update the apply_to in HWY_PUSH_ATTRIBUTES so that + // the attribute also applies to lambdas? If so, remove HWY_ATTR. +#if HWY_GENERIC_LAMBDA + const auto scal = [](const auto d, const auto v) + HWY_ATTR { return Mul(Set(d, Alpha()), v); }; +#else + const SCAL scal; +#endif + Transform(d, a, count, scal); + + const auto info = hwy::detail::MakeTypeInfo(); + const char* target_name = hwy::TargetName(HWY_TARGET); + hwy::detail::AssertArrayEqual(info, expected.get(), a, count, target_name, + __FILE__, __LINE__); + } +}; + +// One extra input array +struct TestTransform1 { + template + void operator()(D d, size_t count, size_t misalign_a, size_t misalign_b, + RandomState& rng) { + using T = TFromD; + // Prevents error if size to allocate is zero. + AlignedFreeUniquePtr pa = + AllocateAligned(HWY_MAX(1, misalign_a + count)); + AlignedFreeUniquePtr pb = + AllocateAligned(HWY_MAX(1, misalign_b + count)); + T* a = pa.get() + misalign_a; + T* b = pb.get() + misalign_b; + for (size_t i = 0; i < count; ++i) { + a[i] = Random(rng); + b[i] = Random(rng); + } + + AlignedFreeUniquePtr expected = AllocateAligned(HWY_MAX(1, count)); + SimpleAXPY(a, b, expected.get(), count); + +#if HWY_GENERIC_LAMBDA + const auto axpy = [](const auto d, const auto v, const auto v1) HWY_ATTR { + return MulAdd(Set(d, Alpha()), v, v1); + }; +#else + const AXPY axpy; +#endif + Transform1(d, a, count, b, axpy); + + const auto info = hwy::detail::MakeTypeInfo(); + const char* target_name = hwy::TargetName(HWY_TARGET); + hwy::detail::AssertArrayEqual(info, expected.get(), a, count, target_name, + __FILE__, __LINE__); + } +}; + +// Two extra input arrays +struct TestTransform2 { + template + void operator()(D d, size_t count, size_t misalign_a, size_t misalign_b, + RandomState& rng) { + using T = TFromD; + // Prevents error if size to allocate is zero. 
+ AlignedFreeUniquePtr pa = + AllocateAligned(HWY_MAX(1, misalign_a + count)); + AlignedFreeUniquePtr pb = + AllocateAligned(HWY_MAX(1, misalign_b + count)); + AlignedFreeUniquePtr pc = + AllocateAligned(HWY_MAX(1, misalign_a + count)); + T* a = pa.get() + misalign_a; + T* b = pb.get() + misalign_b; + T* c = pc.get() + misalign_a; + for (size_t i = 0; i < count; ++i) { + a[i] = Random(rng); + b[i] = Random(rng); + c[i] = Random(rng); + } + + AlignedFreeUniquePtr expected = AllocateAligned(HWY_MAX(1, count)); + SimpleFMA4(a, b, c, expected.get(), count); + +#if HWY_GENERIC_LAMBDA + const auto fma4 = [](auto /*d*/, auto v, auto v1, auto v2) + HWY_ATTR { return MulAdd(v, v1, v2); }; +#else + const FMA4 fma4; +#endif + Transform2(d, a, count, b, c, fma4); + + const auto info = hwy::detail::MakeTypeInfo(); + const char* target_name = hwy::TargetName(HWY_TARGET); + hwy::detail::AssertArrayEqual(info, expected.get(), a, count, target_name, + __FILE__, __LINE__); + } +}; + +template +class IfEq { + public: + IfEq(T val) : val_(val) {} + + template + Mask operator()(D d, V v) const { + return Eq(v, Set(d, val_)); + } + + private: + T val_; +}; + +struct TestReplace { + template + void operator()(D d, size_t count, size_t misalign_a, size_t misalign_b, + RandomState& rng) { + if (misalign_b != 0) return; + if (count == 0) return; + using T = TFromD; + AlignedFreeUniquePtr pa = AllocateAligned(misalign_a + count); + T* a = pa.get() + misalign_a; + for (size_t i = 0; i < count; ++i) { + a[i] = Random(rng); + } + AlignedFreeUniquePtr pb = AllocateAligned(count); + + AlignedFreeUniquePtr expected = AllocateAligned(count); + + std::vector positions(AdjustedReps(count)); + for (size_t& pos : positions) { + pos = static_cast(rng()) % count; + } + + for (size_t pos = 0; pos < count; ++pos) { + const T old_t = a[pos]; + const T new_t = Random(rng); + for (size_t i = 0; i < count; ++i) { + expected[i] = IsEqual(a[i], old_t) ? new_t : a[i]; + } + + // Copy so ReplaceIf gets the same input (and thus also outputs expected) + memcpy(pb.get(), a, count * sizeof(T)); + + Replace(d, a, count, new_t, old_t); + HWY_ASSERT_ARRAY_EQ(expected.get(), a, count); + + ReplaceIf(d, pb.get(), count, new_t, IfEq(old_t)); + HWY_ASSERT_ARRAY_EQ(expected.get(), pb.get(), count); + } + } +}; + +void TestAllGenerate() { + // The test BitCast-s the indices, which does not work for floats. 
+  ForIntegerTypes(ForPartialVectors<ForeachCountAndMisalign<TestGenerate>>());
+}
+
+void TestAllTransform() {
+  ForFloatTypes(ForPartialVectors<ForeachCountAndMisalign<TestTransform>>());
+}
+
+void TestAllTransform1() {
+  ForFloatTypes(ForPartialVectors<ForeachCountAndMisalign<TestTransform1>>());
+}
+
+void TestAllTransform2() {
+  ForFloatTypes(ForPartialVectors<ForeachCountAndMisalign<TestTransform2>>());
+}
+
+void TestAllReplace() {
+  ForFloatTypes(ForPartialVectors<ForeachCountAndMisalign<TestReplace>>());
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+
+namespace hwy {
+HWY_BEFORE_TEST(TransformTest);
+HWY_EXPORT_AND_TEST_P(TransformTest, TestAllGenerate);
+HWY_EXPORT_AND_TEST_P(TransformTest, TestAllTransform);
+HWY_EXPORT_AND_TEST_P(TransformTest, TestAllTransform1);
+HWY_EXPORT_AND_TEST_P(TransformTest, TestAllTransform2);
+HWY_EXPORT_AND_TEST_P(TransformTest, TestAllReplace);
+}  // namespace hwy
+
+#endif
diff --git a/hwy/contrib/dot/dot-inl.h b/hwy/contrib/dot/dot-inl.h
new file mode 100644
index 0000000..e04636f
--- /dev/null
+++ b/hwy/contrib/dot/dot-inl.h
@@ -0,0 +1,252 @@
+// Copyright 2021 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Include guard (still compiled once per target)
+#include <stddef.h>
+
+#if defined(HIGHWAY_HWY_CONTRIB_DOT_DOT_INL_H_) == \
+    defined(HWY_TARGET_TOGGLE)
+#ifdef HIGHWAY_HWY_CONTRIB_DOT_DOT_INL_H_
+#undef HIGHWAY_HWY_CONTRIB_DOT_DOT_INL_H_
+#else
+#define HIGHWAY_HWY_CONTRIB_DOT_DOT_INL_H_
+#endif
+
+#include "hwy/highway.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+struct Dot {
+  // Specify zero or more of these, ORed together, as the kAssumptions template
+  // argument to Compute. Each one may improve performance or reduce code size,
+  // at the cost of additional requirements on the arguments.
+  enum Assumptions {
+    // num_elements is at least N, which may be up to HWY_MAX_BYTES / sizeof(T).
+    kAtLeastOneVector = 1,
+    // num_elements is divisible by N (a power of two, so this can be used if
+    // the problem size is known to be a power of two >= HWY_MAX_BYTES /
+    // sizeof(T)).
+    kMultipleOfVector = 2,
+    // RoundUpTo(num_elements, N) elements are accessible; their value does not
+    // matter (will be treated as if they were zero).
+    kPaddedToVector = 4,
+  };
+
+  // Returns sum{pa[i] * pb[i]} for float or double inputs. Aligning the
+  // pointers to a multiple of N elements is helpful but not required.
+  template <int kAssumptions, class D, typename T = TFromD<D>,
+            HWY_IF_NOT_LANE_SIZE_D(D, 2)>
+  static HWY_INLINE T Compute(const D d, const T* const HWY_RESTRICT pa,
+                              const T* const HWY_RESTRICT pb,
+                              const size_t num_elements) {
+    static_assert(IsFloat<T>(), "MulAdd requires float type");
+    using V = decltype(Zero(d));
+
+    const size_t N = Lanes(d);
+    size_t i = 0;
+
+    constexpr bool kIsAtLeastOneVector =
+        (kAssumptions & kAtLeastOneVector) != 0;
+    constexpr bool kIsMultipleOfVector =
+        (kAssumptions & kMultipleOfVector) != 0;
+    constexpr bool kIsPaddedToVector = (kAssumptions & kPaddedToVector) != 0;
+
+    // Won't be able to do a full vector load without padding => scalar loop.
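+    // e.g. N = 8 lanes but num_elements = 3: none of the assumptions permit
+    // touching lanes 3..7, so fall back to the scalar loop below.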
+    if (!kIsAtLeastOneVector && !kIsMultipleOfVector && !kIsPaddedToVector &&
+        HWY_UNLIKELY(num_elements < N)) {
+      // Only 2x unroll to avoid excessive code size.
+      T sum0 = T(0);
+      T sum1 = T(0);
+      for (; i + 2 <= num_elements; i += 2) {
+        sum0 += pa[i + 0] * pb[i + 0];
+        sum1 += pa[i + 1] * pb[i + 1];
+      }
+      if (i < num_elements) {
+        sum1 += pa[i] * pb[i];
+      }
+      return sum0 + sum1;
+    }
+
+    // Compiler doesn't make independent sum* accumulators, so unroll manually.
+    // 2 FMA ports * 4 cycle latency = up to 8 in-flight, but that is excessive
+    // for unaligned inputs (each unaligned pointer halves the throughput
+    // because it occupies both L1 load ports for a cycle). We cannot have
+    // arrays of vectors on RVV/SVE, so always unroll 4x.
+    V sum0 = Zero(d);
+    V sum1 = Zero(d);
+    V sum2 = Zero(d);
+    V sum3 = Zero(d);
+
+    // Main loop: unrolled
+    for (; i + 4 * N <= num_elements; /* i += 4 * N */) {  // incr in loop
+      const auto a0 = LoadU(d, pa + i);
+      const auto b0 = LoadU(d, pb + i);
+      i += N;
+      sum0 = MulAdd(a0, b0, sum0);
+      const auto a1 = LoadU(d, pa + i);
+      const auto b1 = LoadU(d, pb + i);
+      i += N;
+      sum1 = MulAdd(a1, b1, sum1);
+      const auto a2 = LoadU(d, pa + i);
+      const auto b2 = LoadU(d, pb + i);
+      i += N;
+      sum2 = MulAdd(a2, b2, sum2);
+      const auto a3 = LoadU(d, pa + i);
+      const auto b3 = LoadU(d, pb + i);
+      i += N;
+      sum3 = MulAdd(a3, b3, sum3);
+    }
+
+    // Up to 3 iterations of whole vectors
+    for (; i + N <= num_elements; i += N) {
+      const auto a = LoadU(d, pa + i);
+      const auto b = LoadU(d, pb + i);
+      sum0 = MulAdd(a, b, sum0);
+    }
+
+    if (!kIsMultipleOfVector) {
+      const size_t remaining = num_elements - i;
+      if (remaining != 0) {
+        if (kIsPaddedToVector) {
+          const auto mask = FirstN(d, remaining);
+          const auto a = LoadU(d, pa + i);
+          const auto b = LoadU(d, pb + i);
+          sum1 = MulAdd(IfThenElseZero(mask, a), IfThenElseZero(mask, b), sum1);
+        } else {
+          // Unaligned load such that the last element is in the highest lane -
+          // ensures we do not touch any elements outside the valid range.
+          // If we get here, then num_elements >= N.
+          HWY_DASSERT(i >= N);
+          i += remaining - N;
+          const auto skip = FirstN(d, N - remaining);
+          const auto a = LoadU(d, pa + i);  // always unaligned
+          const auto b = LoadU(d, pb + i);
+          sum1 = MulAdd(IfThenZeroElse(skip, a), IfThenZeroElse(skip, b), sum1);
+        }
+      }
+    }  // kMultipleOfVector
+
+    // Reduction tree: sum of all accumulators by pairs, then across lanes.
+    sum0 = Add(sum0, sum1);
+    sum2 = Add(sum2, sum3);
+    sum0 = Add(sum0, sum2);
+    return GetLane(SumOfLanes(d, sum0));
+  }
+
+  // Returns sum{pa[i] * pb[i]} for bfloat16 inputs. Aligning the pointers to a
+  // multiple of N elements is helpful but not required.
+  template <int kAssumptions, class D>
+  static HWY_INLINE float Compute(const D d,
+                                  const bfloat16_t* const HWY_RESTRICT pa,
+                                  const bfloat16_t* const HWY_RESTRICT pb,
+                                  const size_t num_elements) {
+    const RebindToUnsigned<D> du16;
+    const Repartition<float, D> df32;
+
+    using V = decltype(Zero(df32));
+    const size_t N = Lanes(d);
+    size_t i = 0;
+
+    constexpr bool kIsAtLeastOneVector =
+        (kAssumptions & kAtLeastOneVector) != 0;
+    constexpr bool kIsMultipleOfVector =
+        (kAssumptions & kMultipleOfVector) != 0;
+    constexpr bool kIsPaddedToVector = (kAssumptions & kPaddedToVector) != 0;
+
+    // Won't be able to do a full vector load without padding => scalar loop.
+    if (!kIsAtLeastOneVector && !kIsMultipleOfVector && !kIsPaddedToVector &&
+        HWY_UNLIKELY(num_elements < N)) {
+      float sum0 = 0.0f;  // Only 2x unroll to avoid excessive code size for..
+      float sum1 = 0.0f;  // this unlikely(?) case.
+      for (; i + 2 <= num_elements; i += 2) {
+        sum0 += F32FromBF16(pa[i + 0]) * F32FromBF16(pb[i + 0]);
+        sum1 += F32FromBF16(pa[i + 1]) * F32FromBF16(pb[i + 1]);
+      }
+      if (i < num_elements) {
+        sum1 += F32FromBF16(pa[i]) * F32FromBF16(pb[i]);
+      }
+      return sum0 + sum1;
+    }
+
+    // See comment in the other Compute() overload. Unroll 2x, but we need
+    // twice as many sums for ReorderWidenMulAccumulate.
+    V sum0 = Zero(df32);
+    V sum1 = Zero(df32);
+    V sum2 = Zero(df32);
+    V sum3 = Zero(df32);
+
+    // Main loop: unrolled
+    for (; i + 2 * N <= num_elements; /* i += 2 * N */) {  // incr in loop
+      const auto a0 = LoadU(d, pa + i);
+      const auto b0 = LoadU(d, pb + i);
+      i += N;
+      sum0 = ReorderWidenMulAccumulate(df32, a0, b0, sum0, sum1);
+      const auto a1 = LoadU(d, pa + i);
+      const auto b1 = LoadU(d, pb + i);
+      i += N;
+      sum2 = ReorderWidenMulAccumulate(df32, a1, b1, sum2, sum3);
+    }
+
+    // Possibly one more iteration of whole vectors
+    if (i + N <= num_elements) {
+      const auto a0 = LoadU(d, pa + i);
+      const auto b0 = LoadU(d, pb + i);
+      i += N;
+      sum0 = ReorderWidenMulAccumulate(df32, a0, b0, sum0, sum1);
+    }
+
+    if (!kIsMultipleOfVector) {
+      const size_t remaining = num_elements - i;
+      if (remaining != 0) {
+        if (kIsPaddedToVector) {
+          const auto mask = FirstN(du16, remaining);
+          const auto va = LoadU(d, pa + i);
+          const auto vb = LoadU(d, pb + i);
+          const auto a16 = BitCast(d, IfThenElseZero(mask, BitCast(du16, va)));
+          const auto b16 = BitCast(d, IfThenElseZero(mask, BitCast(du16, vb)));
+          sum2 = ReorderWidenMulAccumulate(df32, a16, b16, sum2, sum3);
+
+        } else {
+          // Unaligned load such that the last element is in the highest lane -
+          // ensures we do not touch any elements outside the valid range.
+          // If we get here, then num_elements >= N.
+          HWY_DASSERT(i >= N);
+          i += remaining - N;
+          const auto skip = FirstN(du16, N - remaining);
+          const auto va = LoadU(d, pa + i);  // always unaligned
+          const auto vb = LoadU(d, pb + i);
+          const auto a16 = BitCast(d, IfThenZeroElse(skip, BitCast(du16, va)));
+          const auto b16 = BitCast(d, IfThenZeroElse(skip, BitCast(du16, vb)));
+          sum2 = ReorderWidenMulAccumulate(df32, a16, b16, sum2, sum3);
+        }
+      }
+    }  // kMultipleOfVector
+
+    // Reduction tree: sum of all accumulators by pairs, then across lanes.
+    sum0 = Add(sum0, sum1);
+    sum2 = Add(sum2, sum3);
+    sum0 = Add(sum0, sum2);
+    return GetLane(SumOfLanes(df32, sum0));
+  }
+};
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#endif  // HIGHWAY_HWY_CONTRIB_DOT_DOT_INL_H_
diff --git a/hwy/contrib/dot/dot_test.cc b/hwy/contrib/dot/dot_test.cc
new file mode 100644
index 0000000..12d7ab2
--- /dev/null
+++ b/hwy/contrib/dot/dot_test.cc
@@ -0,0 +1,167 @@
+// Copyright 2021 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>
+
+#include "hwy/aligned_allocator.h"
+
+// clang-format off
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "hwy/contrib/dot/dot_test.cc"
+#include "hwy/foreach_target.h"  // IWYU pragma: keep
+
+#include "hwy/contrib/dot/dot-inl.h"
+#include "hwy/tests/test_util-inl.h"
+// clang-format on
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+template <typename T>
+HWY_NOINLINE T SimpleDot(const T* pa, const T* pb, size_t num) {
+  double sum = 0.0;
+  for (size_t i = 0; i < num; ++i) {
+    sum += pa[i] * pb[i];
+  }
+  return static_cast<T>(sum);
+}
+
+HWY_NOINLINE float SimpleDot(const bfloat16_t* pa, const bfloat16_t* pb,
+                             size_t num) {
+  float sum = 0.0f;
+  for (size_t i = 0; i < num; ++i) {
+    sum += F32FromBF16(pa[i]) * F32FromBF16(pb[i]);
+  }
+  return sum;
+}
+
+template <typename T>
+void SetValue(const float value, T* HWY_RESTRICT ptr) {
+  *ptr = static_cast<T>(value);
+}
+void SetValue(const float value, bfloat16_t* HWY_RESTRICT ptr) {
+  *ptr = BF16FromF32(value);
+}
+
+class TestDot {
+  // Computes/verifies one dot product.
+  template <int kAssumptions, class D>
+  void Test(D d, size_t num, size_t misalign_a, size_t misalign_b,
+            RandomState& rng) {
+    using T = TFromD<D>;
+    const size_t N = Lanes(d);
+    const auto random_t = [&rng]() {
+      const int32_t bits = static_cast<int32_t>(Random32(&rng)) & 1023;
+      return static_cast<float>(bits - 512) * (1.0f / 64);
+    };
+
+    const size_t padded =
+        (kAssumptions & Dot::kPaddedToVector) ? RoundUpTo(num, N) : num;
+    AlignedFreeUniquePtr<T[]> pa = AllocateAligned<T>(misalign_a + padded);
+    AlignedFreeUniquePtr<T[]> pb = AllocateAligned<T>(misalign_b + padded);
+    T* a = pa.get() + misalign_a;
+    T* b = pb.get() + misalign_b;
+    size_t i = 0;
+    for (; i < num; ++i) {
+      SetValue(random_t(), a + i);
+      SetValue(random_t(), b + i);
+    }
+    // Fill padding with NaN - the values are not used, but avoids MSAN errors.
+    for (; i < padded; ++i) {
+      ScalableTag<float> df1;
+      SetValue(GetLane(NaN(df1)), a + i);
+      SetValue(GetLane(NaN(df1)), b + i);
+    }
+
+    const auto expected = SimpleDot(a, b, num);
+    const auto actual = Dot::Compute<kAssumptions>(d, a, b, num);
+    const auto max = static_cast<decltype(actual)>(8 * 8 * num);
+    HWY_ASSERT(-max <= actual && actual <= max);
+    HWY_ASSERT(expected - 1E-4 <= actual && actual <= expected + 1E-4);
+  }
+
+  // Runs tests with various alignments.
+  template <int kAssumptions, class D>
+  void ForeachMisalign(D d, size_t num, RandomState& rng) {
+    const size_t N = Lanes(d);
+    const size_t misalignments[3] = {0, N / 4, 3 * N / 5};
+    for (size_t ma : misalignments) {
+      for (size_t mb : misalignments) {
+        Test<kAssumptions>(d, num, ma, mb, rng);
+      }
+    }
+  }
+
+  // Runs tests with various lengths compatible with the given assumptions.
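+  // (Counts below N exercise the scalar fallback, and counts that are not
+  // multiples of N exercise the remainder handling in Compute.)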
+  template <int kAssumptions, class D>
+  void ForeachCount(D d, RandomState& rng) {
+    const size_t N = Lanes(d);
+    const size_t counts[] = {1,
+                             3,
+                             7,
+                             16,
+                             HWY_MAX(N / 2, 1),
+                             HWY_MAX(2 * N / 3, 1),
+                             N,
+                             N + 1,
+                             4 * N / 3,
+                             3 * N,
+                             8 * N,
+                             8 * N + 2};
+    for (size_t num : counts) {
+      if ((kAssumptions & Dot::kAtLeastOneVector) && num < N) continue;
+      if ((kAssumptions & Dot::kMultipleOfVector) && (num % N) != 0) continue;
+      ForeachMisalign<kAssumptions>(d, num, rng);
+    }
+  }
+
+ public:
+  template <typename T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    RandomState rng;
+
+    // All 8 combinations of the three length-related flags:
+    ForeachCount<0>(d, rng);
+    ForeachCount<Dot::kAtLeastOneVector>(d, rng);
+    ForeachCount<Dot::kMultipleOfVector>(d, rng);
+    ForeachCount<Dot::kAtLeastOneVector | Dot::kMultipleOfVector>(d, rng);
+    ForeachCount<Dot::kPaddedToVector>(d, rng);
+    ForeachCount<Dot::kAtLeastOneVector | Dot::kPaddedToVector>(d, rng);
+    ForeachCount<Dot::kMultipleOfVector | Dot::kPaddedToVector>(d, rng);
+    ForeachCount<Dot::kAtLeastOneVector | Dot::kMultipleOfVector |
+                 Dot::kPaddedToVector>(d, rng);
+  }
+};
+
+void TestAllDot() { ForFloatTypes(ForPartialVectors<TestDot>()); }
+void TestAllDotBF16() { ForShrinkableVectors<TestDot>()(bfloat16_t()); }
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+
+namespace hwy {
+HWY_BEFORE_TEST(DotTest);
+HWY_EXPORT_AND_TEST_P(DotTest, TestAllDot);
+HWY_EXPORT_AND_TEST_P(DotTest, TestAllDotBF16);
+}  // namespace hwy
+
+#endif
diff --git a/hwy/contrib/image/image.cc b/hwy/contrib/image/image.cc
new file mode 100644
index 0000000..2bcdcd6
--- /dev/null
+++ b/hwy/contrib/image/image.cc
@@ -0,0 +1,145 @@
+// Copyright 2020 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "hwy/contrib/image/image.h"
+
+#include <algorithm>  // swap
+#include <cstring>
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "hwy/contrib/image/image.cc"
+#include "hwy/foreach_target.h"  // IWYU pragma: keep
+#include "hwy/highway.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+size_t GetVectorSize() { return Lanes(ScalableTag<uint8_t>()); }
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+
+}  // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace hwy {
+namespace {
+HWY_EXPORT(GetVectorSize);  // Local function.
+}  // namespace
+
+size_t ImageBase::VectorSize() {
+  // Do not cache result - must return the current value, which may be greater
+  // than the first call if it was subject to DisableTargets!
+  return HWY_DYNAMIC_DISPATCH(GetVectorSize)();
+}
+
+size_t ImageBase::BytesPerRow(const size_t xsize, const size_t sizeof_t) {
+  const size_t vec_size = VectorSize();
+  size_t valid_bytes = xsize * sizeof_t;
+
+  // Allow unaligned accesses starting at the last valid value - this may raise
+  // msan errors unless the user calls InitializePaddingForUnalignedAccesses.
+  // Skip for the scalar case because no extra lanes will be loaded.
+  if (vec_size != 1) {
+    HWY_DASSERT(vec_size >= sizeof_t);
+    valid_bytes += vec_size - sizeof_t;
+  }
+
+  // Round up to vector and cache line size.
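+  // e.g. xsize = 100 floats with vec_size = 32 and HWY_ALIGNMENT = 64
+  // (values for illustration): valid_bytes = 400 + 28 = 428, which rounds up
+  // to the next multiple of 64, i.e. 448.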
+  const size_t align = HWY_MAX(vec_size, HWY_ALIGNMENT);
+  size_t bytes_per_row = RoundUpTo(valid_bytes, align);
+
+  // During the lengthy window before writes are committed to memory, CPUs
+  // guard against read after write hazards by checking the address, but
+  // only the lower 11 bits. We avoid a false dependency between writes to
+  // consecutive rows by ensuring their sizes are not multiples of 2 KiB.
+  // Avoid2K prevents the same problem for the planes of an Image3.
+  if (bytes_per_row % HWY_ALIGNMENT == 0) {
+    bytes_per_row += align;
+  }
+
+  HWY_DASSERT(bytes_per_row % align == 0);
+  return bytes_per_row;
+}
+
+ImageBase::ImageBase(const size_t xsize, const size_t ysize,
+                     const size_t sizeof_t)
+    : xsize_(static_cast<uint32_t>(xsize)),
+      ysize_(static_cast<uint32_t>(ysize)),
+      bytes_(nullptr, AlignedFreer(&AlignedFreer::DoNothing, nullptr)) {
+  HWY_ASSERT(sizeof_t == 1 || sizeof_t == 2 || sizeof_t == 4 || sizeof_t == 8);
+
+  bytes_per_row_ = 0;
+  // Dimensions can be zero, e.g. for lazily-allocated images. Only allocate
+  // if nonzero, because "zero" bytes still have padding/bookkeeping overhead.
+  if (xsize != 0 && ysize != 0) {
+    bytes_per_row_ = BytesPerRow(xsize, sizeof_t);
+    bytes_ = AllocateAligned<uint8_t>(bytes_per_row_ * ysize);
+    HWY_ASSERT(bytes_.get() != nullptr);
+    InitializePadding(sizeof_t, Padding::kRoundUp);
+  }
+}
+
+ImageBase::ImageBase(const size_t xsize, const size_t ysize,
+                     const size_t bytes_per_row, void* const aligned)
+    : xsize_(static_cast<uint32_t>(xsize)),
+      ysize_(static_cast<uint32_t>(ysize)),
+      bytes_per_row_(bytes_per_row),
+      bytes_(static_cast<uint8_t*>(aligned),
+             AlignedFreer(&AlignedFreer::DoNothing, nullptr)) {
+  const size_t vec_size = VectorSize();
+  HWY_ASSERT(bytes_per_row % vec_size == 0);
+  HWY_ASSERT(reinterpret_cast<uintptr_t>(aligned) % vec_size == 0);
+}
+
+void ImageBase::InitializePadding(const size_t sizeof_t, Padding padding) {
+#if HWY_IS_MSAN || HWY_IDE
+  if (xsize_ == 0 || ysize_ == 0) return;
+
+  const size_t vec_size = VectorSize();  // Bytes, independent of sizeof_t!
+  if (vec_size == 1) return;  // Scalar mode: no padding needed
+
+  const size_t valid_size = xsize_ * sizeof_t;
+  const size_t initialize_size = padding == Padding::kRoundUp
+                                     ? RoundUpTo(valid_size, vec_size)
+                                     : valid_size + vec_size - sizeof_t;
+  if (valid_size == initialize_size) return;
+
+  for (size_t y = 0; y < ysize_; ++y) {
+    uint8_t* HWY_RESTRICT row = static_cast<uint8_t*>(VoidRow(y));
+#if defined(__clang__) && (__clang_major__ <= 6)
+    // There's a bug in msan in clang-6 when handling AVX2 operations. This
+    // workaround allows tests to pass on msan, although it is slower and
+    // prevents msan warnings from uninitialized images.
+    memset(row, 0, initialize_size);
+#else
+    memset(row + valid_size, 0, initialize_size - valid_size);
+#endif  // clang6
+  }
+#else
+  (void)sizeof_t;
+  (void)padding;
+#endif  // HWY_IS_MSAN
+}
+
+void ImageBase::Swap(ImageBase& other) {
+  std::swap(xsize_, other.xsize_);
+  std::swap(ysize_, other.ysize_);
+  std::swap(bytes_per_row_, other.bytes_per_row_);
+  std::swap(bytes_, other.bytes_);
+}
+
+}  // namespace hwy
+#endif  // HWY_ONCE
diff --git a/hwy/contrib/image/image.h b/hwy/contrib/image/image.h
new file mode 100644
index 0000000..231f3c5
--- /dev/null
+++ b/hwy/contrib/image/image.h
@@ -0,0 +1,471 @@
+// Copyright 2020 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef HIGHWAY_HWY_CONTRIB_IMAGE_IMAGE_H_
+#define HIGHWAY_HWY_CONTRIB_IMAGE_IMAGE_H_
+
+// SIMD/multicore-friendly planar image representation with row accessors.
+
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+
+#include <cstddef>
+#include <utility>  // std::move
+
+#include "hwy/aligned_allocator.h"
+#include "hwy/base.h"
+#include "hwy/highway_export.h"
+
+namespace hwy {
+
+// Type-independent parts of Image<> - reduces code duplication and facilitates
+// moving member function implementations to cc file.
+struct HWY_CONTRIB_DLLEXPORT ImageBase {
+  // Returns required alignment in bytes for externally allocated memory.
+  static size_t VectorSize();
+
+  // Returns distance [bytes] between the start of two consecutive rows, a
+  // multiple of VectorSize but NOT kAlias (see implementation).
+  static size_t BytesPerRow(const size_t xsize, const size_t sizeof_t);
+
+  // No allocation (for output params or unused images)
+  ImageBase()
+      : xsize_(0),
+        ysize_(0),
+        bytes_per_row_(0),
+        bytes_(nullptr, AlignedFreer(&AlignedFreer::DoNothing, nullptr)) {}
+
+  // Allocates memory (this is the common case)
+  ImageBase(size_t xsize, size_t ysize, size_t sizeof_t);
+
+  // References but does not take ownership of external memory. Useful for
+  // interoperability with other libraries. `aligned` must be aligned to a
+  // multiple of VectorSize() and `bytes_per_row` must also be a multiple of
+  // VectorSize() or preferably equal to BytesPerRow().
+  ImageBase(size_t xsize, size_t ysize, size_t bytes_per_row, void* aligned);
+
+  // Copy construction/assignment is forbidden to avoid inadvertent copies,
+  // which can be very expensive. Use CopyImageTo() instead.
+  ImageBase(const ImageBase& other) = delete;
+  ImageBase& operator=(const ImageBase& other) = delete;
+
+  // Move constructor (required for returning Image from function)
+  ImageBase(ImageBase&& other) noexcept = default;
+
+  // Move assignment (required for std::vector)
+  ImageBase& operator=(ImageBase&& other) noexcept = default;
+
+  void Swap(ImageBase& other);
+
+  // Useful for pre-allocating image with some padding for alignment purposes
+  // and later reporting the actual valid dimensions. Caller is responsible
+  // for ensuring xsize/ysize are <= the original dimensions.
+  void ShrinkTo(const size_t xsize, const size_t ysize) {
+    xsize_ = static_cast<uint32_t>(xsize);
+    ysize_ = static_cast<uint32_t>(ysize);
+    // NOTE: we can't recompute bytes_per_row for more compact storage and
+    // better locality because that would invalidate the image contents.
+  }
+
+  // How many pixels.
+  HWY_INLINE size_t xsize() const { return xsize_; }
+  HWY_INLINE size_t ysize() const { return ysize_; }
+
+  // NOTE: do not use this for copying rows - the valid xsize may be much less.
+  HWY_INLINE size_t bytes_per_row() const { return bytes_per_row_; }
+
+  // Raw access to byte contents, for interfacing with other libraries.
+  // Unsigned char instead of char to avoid surprises (sign extension).
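+  // (Rows allocated by this class come from AllocateAligned and are thus at
+  // least 64-byte aligned, which the HWY_ASSUME_ALIGNED below relies on.)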
+  HWY_INLINE uint8_t* bytes() {
+    void* p = bytes_.get();
+    return static_cast<uint8_t*>(HWY_ASSUME_ALIGNED(p, 64));
+  }
+  HWY_INLINE const uint8_t* bytes() const {
+    const void* p = bytes_.get();
+    return static_cast<const uint8_t*>(HWY_ASSUME_ALIGNED(p, 64));
+  }
+
+ protected:
+  // Returns pointer to the start of a row.
+  HWY_INLINE void* VoidRow(const size_t y) const {
+#if HWY_IS_ASAN || HWY_IS_MSAN || HWY_IS_TSAN
+    if (y >= ysize_) {
+      HWY_ABORT("Row(%d) >= %u\n", static_cast<int>(y), ysize_);
+    }
+#endif
+
+    void* row = bytes_.get() + y * bytes_per_row_;
+    return HWY_ASSUME_ALIGNED(row, 64);
+  }
+
+  enum class Padding {
+    // Allow Load(d, row + x) for x = 0; x < xsize(); x += Lanes(d). Default.
+    kRoundUp,
+    // Allow LoadU(d, row + x) for x <= xsize() - 1. This requires an extra
+    // vector to be initialized. If done by default, this would suppress
+    // legitimate msan warnings. We therefore require users to explicitly call
+    // InitializePadding before using unaligned loads (e.g. convolution).
+    kUnaligned
+  };
+
+  // Initializes the minimum bytes required to suppress msan warnings from
+  // legitimate (according to Padding mode) vector loads/stores on the right
+  // border, where some lanes are uninitialized and assumed to be unused.
+  void InitializePadding(size_t sizeof_t, Padding padding);
+
+  // (Members are non-const to enable assignment during move-assignment.)
+  uint32_t xsize_;  // In valid pixels, not including any padding.
+  uint32_t ysize_;
+  size_t bytes_per_row_;  // Includes padding.
+  AlignedFreeUniquePtr<uint8_t[]> bytes_;
+};
+
+// Single channel, aligned rows separated by padding. T must be POD.
+//
+// 'Single channel' (one 2D array per channel) simplifies vectorization
+// (repeating the same operation on multiple adjacent components) without the
+// complexity of a hybrid layout (8 R, 8 G, 8 B, ...). In particular, clients
+// can easily iterate over all components in a row and Image requires no
+// knowledge of the pixel format beyond the component type "T".
+//
+// 'Aligned' means each row is aligned to the L1 cache line size. This prevents
+// false sharing between two threads operating on adjacent rows.
+//
+// 'Padding' is still relevant because vectors could potentially be larger than
+// a cache line. By rounding up row sizes to the vector size, we allow
+// reading/writing ALIGNED vectors whose first lane is a valid sample. This
+// avoids needing a separate loop to handle remaining unaligned lanes.
+//
+// This image layout could also be achieved with a vector and a row accessor
+// function, but a class wrapper with support for "deleter" allows wrapping
+// existing memory allocated by clients without copying the pixels. It also
+// provides convenient accessors for xsize/ysize, which shortens function
+// argument lists. Supports move-construction so it can be stored in containers.
+template <typename ComponentType>
+class Image : public ImageBase {
+ public:
+  using T = ComponentType;
+
+  Image() = default;
+  Image(const size_t xsize, const size_t ysize)
+      : ImageBase(xsize, ysize, sizeof(T)) {}
+  Image(const size_t xsize, const size_t ysize, size_t bytes_per_row,
+        void* aligned)
+      : ImageBase(xsize, ysize, bytes_per_row, aligned) {}
+
+  void InitializePaddingForUnalignedAccesses() {
+    InitializePadding(sizeof(T), Padding::kUnaligned);
+  }
+
+  HWY_INLINE const T* ConstRow(const size_t y) const {
+    return static_cast<const T*>(VoidRow(y));
+  }
+  HWY_INLINE const T* ConstRow(const size_t y) {
+    return static_cast<const T*>(VoidRow(y));
+  }
+
+  // Returns pointer to non-const. This allows passing const Image* parameters
+  // when the callee is only supposed to fill the pixels, as opposed to
+  // allocating or resizing the image.
+  HWY_INLINE T* MutableRow(const size_t y) const {
+    return static_cast<T*>(VoidRow(y));
+  }
+  HWY_INLINE T* MutableRow(const size_t y) {
+    return static_cast<T*>(VoidRow(y));
+  }
+
+  // Returns number of pixels (some of which are padding) per row. Useful for
+  // computing other rows via pointer arithmetic. WARNING: this must
+  // NOT be used to determine xsize.
+  HWY_INLINE intptr_t PixelsPerRow() const {
+    return static_cast<intptr_t>(bytes_per_row_ / sizeof(T));
+  }
+};
+
+using ImageF = Image<float>;
+
+// A bundle of 3 same-sized images. To fill an existing Image3 using
+// single-channel producers, we also need access to each const Image*. Const
+// prevents breaking the same-size invariant, while still allowing pixels to be
+// changed via MutableRow.
+template <typename ComponentType>
+class Image3 {
+ public:
+  using T = ComponentType;
+  using ImageT = Image<T>;
+  static constexpr size_t kNumPlanes = 3;
+
+  Image3() : planes_{ImageT(), ImageT(), ImageT()} {}
+
+  Image3(const size_t xsize, const size_t ysize)
+      : planes_{ImageT(xsize, ysize), ImageT(xsize, ysize),
+                ImageT(xsize, ysize)} {}
+
+  Image3(Image3&& other) noexcept {
+    for (size_t i = 0; i < kNumPlanes; i++) {
+      planes_[i] = std::move(other.planes_[i]);
+    }
+  }
+
+  Image3(ImageT&& plane0, ImageT&& plane1, ImageT&& plane2) {
+    if (!SameSize(plane0, plane1) || !SameSize(plane0, plane2)) {
+      HWY_ABORT(
+          "Not same size: %d x %d, %d x %d, %d x %d\n",
+          static_cast<int>(plane0.xsize()), static_cast<int>(plane0.ysize()),
+          static_cast<int>(plane1.xsize()), static_cast<int>(plane1.ysize()),
+          static_cast<int>(plane2.xsize()), static_cast<int>(plane2.ysize()));
+    }
+    planes_[0] = std::move(plane0);
+    planes_[1] = std::move(plane1);
+    planes_[2] = std::move(plane2);
+  }
+
+  // Copy construction/assignment is forbidden to avoid inadvertent copies,
+  // which can be very expensive. Use CopyImageTo instead.
+  Image3(const Image3& other) = delete;
+  Image3& operator=(const Image3& other) = delete;
+
+  Image3& operator=(Image3&& other) noexcept {
+    for (size_t i = 0; i < kNumPlanes; i++) {
+      planes_[i] = std::move(other.planes_[i]);
+    }
+    return *this;
+  }
+
+  HWY_INLINE const T* ConstPlaneRow(const size_t c, const size_t y) const {
+    return static_cast<const T*>(VoidPlaneRow(c, y));
+  }
+  HWY_INLINE const T* ConstPlaneRow(const size_t c, const size_t y) {
+    return static_cast<const T*>(VoidPlaneRow(c, y));
+  }
+
+  HWY_INLINE T* MutablePlaneRow(const size_t c, const size_t y) const {
+    return static_cast<T*>(VoidPlaneRow(c, y));
+  }
+  HWY_INLINE T* MutablePlaneRow(const size_t c, const size_t y) {
+    return static_cast<T*>(VoidPlaneRow(c, y));
+  }
+
+  HWY_INLINE const ImageT& Plane(size_t idx) const { return planes_[idx]; }
+
+  void Swap(Image3& other) {
+    for (size_t c = 0; c < 3; ++c) {
+      other.planes_[c].Swap(planes_[c]);
+    }
+  }
+
+  void ShrinkTo(const size_t xsize, const size_t ysize) {
+    for (ImageT& plane : planes_) {
+      plane.ShrinkTo(xsize, ysize);
+    }
+  }
+
+  // Sizes of all three images are guaranteed to be equal.
+  HWY_INLINE size_t xsize() const { return planes_[0].xsize(); }
+  HWY_INLINE size_t ysize() const { return planes_[0].ysize(); }
+  // Returns offset [bytes] from one row to the next row of the same plane.
+  // WARNING: this must NOT be used to determine xsize, nor for copying rows -
+  // the valid xsize may be much less.
+  HWY_INLINE size_t bytes_per_row() const { return planes_[0].bytes_per_row(); }
+  // Returns number of pixels (some of which are padding) per row. Useful for
+  // computing other rows via pointer arithmetic. WARNING: this must NOT be
+  // used to determine xsize.
+  HWY_INLINE intptr_t PixelsPerRow() const { return planes_[0].PixelsPerRow(); }
+
+ private:
+  // Returns pointer to the start of a row.
+  HWY_INLINE void* VoidPlaneRow(const size_t c, const size_t y) const {
+#if HWY_IS_ASAN || HWY_IS_MSAN || HWY_IS_TSAN
+    if (c >= kNumPlanes || y >= ysize()) {
+      HWY_ABORT("PlaneRow(%d, %d) >= %d\n", static_cast<int>(c),
+                static_cast<int>(y), static_cast<int>(ysize()));
+    }
+#endif
+    // Use the first plane's stride because the compiler might not realize they
+    // are all equal. Thus we only need a single multiplication for all planes.
+    const size_t row_offset = y * planes_[0].bytes_per_row();
+    const void* row = planes_[c].bytes() + row_offset;
+    return static_cast<void*>(HWY_ASSUME_ALIGNED(row, HWY_ALIGNMENT));
+  }
+
+ private:
+  ImageT planes_[kNumPlanes];
+};
+
+using Image3F = Image3<float>;
+
+// Rectangular region in image(s). Factoring this out of Image instead of
+// shifting the pointer by x0/y0 allows this to apply to multiple images with
+// different resolutions. Can compare size via SameSize(rect1, rect2).
+class Rect {
+ public:
+  // Most windows are xsize_max * ysize_max, except those on the borders where
+  // begin + size_max > end.
+  constexpr Rect(size_t xbegin, size_t ybegin, size_t xsize_max,
+                 size_t ysize_max, size_t xend, size_t yend)
+      : x0_(xbegin),
+        y0_(ybegin),
+        xsize_(ClampedSize(xbegin, xsize_max, xend)),
+        ysize_(ClampedSize(ybegin, ysize_max, yend)) {}
+
+  // Construct with origin and known size (typically from another Rect).
+  constexpr Rect(size_t xbegin, size_t ybegin, size_t xsize, size_t ysize)
+      : x0_(xbegin), y0_(ybegin), xsize_(xsize), ysize_(ysize) {}
+
+  // Construct a rect that covers a whole image.
+  template <typename T>
+  explicit Rect(const Image<T>& image)
+      : Rect(0, 0, image.xsize(), image.ysize()) {}
+
+  Rect() : Rect(0, 0, 0, 0) {}
+
+  Rect(const Rect&) = default;
+  Rect& operator=(const Rect&) = default;
+
+  Rect Subrect(size_t xbegin, size_t ybegin, size_t xsize_max,
+               size_t ysize_max) {
+    return Rect(x0_ + xbegin, y0_ + ybegin, xsize_max, ysize_max, x0_ + xsize_,
+                y0_ + ysize_);
+  }
+
+  template <typename T>
+  const T* ConstRow(const Image<T>* image, size_t y) const {
+    return image->ConstRow(y + y0_) + x0_;
+  }
+
+  template <typename T>
+  T* MutableRow(const Image<T>* image, size_t y) const {
+    return image->MutableRow(y + y0_) + x0_;
+  }
+
+  template <typename T>
+  const T* ConstPlaneRow(const Image3<T>& image, size_t c, size_t y) const {
+    return image.ConstPlaneRow(c, y + y0_) + x0_;
+  }
+
+  template <typename T>
+  T* MutablePlaneRow(Image3<T>* image, const size_t c, size_t y) const {
+    return image->MutablePlaneRow(c, y + y0_) + x0_;
+  }
+
+  // Returns true if this Rect fully resides in the given image. ImageT could
+  // be Image or Image3; however if ImageT is Rect, results are nonsensical.
+  template <class ImageT>
+  bool IsInside(const ImageT& image) const {
+    return (x0_ + xsize_ <= image.xsize()) && (y0_ + ysize_ <= image.ysize());
+  }
+
+  size_t x0() const { return x0_; }
+  size_t y0() const { return y0_; }
+  size_t xsize() const { return xsize_; }
+  size_t ysize() const { return ysize_; }
+
+ private:
+  // Returns size_max, or whatever is left in [begin, end).
+  static constexpr size_t ClampedSize(size_t begin, size_t size_max,
+                                      size_t end) {
+    return (begin + size_max <= end) ? size_max
+                                     : (end > begin ? end - begin : 0);
+  }
+
+  size_t x0_;
+  size_t y0_;
+
+  size_t xsize_;
+  size_t ysize_;
+};
+
+// Works for any image-like input type(s).
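+// e.g. SameSize(Image<float>(8, 8), Image3<float>(8, 8)) compiles because
+// both types provide xsize() and ysize().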
+template <class Image1, class Image2>
+HWY_MAYBE_UNUSED bool SameSize(const Image1& image1, const Image2& image2) {
+  return image1.xsize() == image2.xsize() && image1.ysize() == image2.ysize();
+}
+
+// Mirrors out of bounds coordinates and returns valid coordinates unchanged.
+// We assume the radius (distance outside the image) is small compared to the
+// image size, otherwise this might not terminate.
+// The mirror is outside the last column (border pixel is also replicated).
+static HWY_INLINE HWY_MAYBE_UNUSED size_t Mirror(int64_t x,
+                                                 const int64_t xsize) {
+  HWY_DASSERT(xsize != 0);
+
+  // TODO(janwas): replace with branchless version
+  while (x < 0 || x >= xsize) {
+    if (x < 0) {
+      x = -x - 1;
+    } else {
+      x = 2 * xsize - 1 - x;
+    }
+  }
+  return static_cast<size_t>(x);
+}
+
+// Wrap modes for ensuring X/Y coordinates are in the valid range [0, size):
+
+// Mirrors (repeating the edge pixel once). Useful for convolutions.
+struct WrapMirror {
+  HWY_INLINE size_t operator()(const int64_t coord, const size_t size) const {
+    return Mirror(coord, static_cast<int64_t>(size));
+  }
+};
+
+// Returns the same coordinate, for when we know "coord" is already valid (e.g.
+// interior of an image).
+struct WrapUnchanged {
+  HWY_INLINE size_t operator()(const int64_t coord, size_t /*size*/) const {
+    return static_cast<size_t>(coord);
+  }
+};
+
+// Similar to Wrap* but for row pointers (reduces Row() multiplications).
+
+class WrapRowMirror {
+ public:
+  template <class View>
+  WrapRowMirror(const View& image, size_t ysize)
+      : first_row_(image.ConstRow(0)), last_row_(image.ConstRow(ysize - 1)) {}
+
+  const float* operator()(const float* const HWY_RESTRICT row,
+                          const int64_t stride) const {
+    if (row < first_row_) {
+      const int64_t num_before = first_row_ - row;
+      // Mirrored; one row before => row 0, two before = row 1, ...
+      return first_row_ + num_before - stride;
+    }
+    if (row > last_row_) {
+      const int64_t num_after = row - last_row_;
+      // Mirrored; one row after => last row, two after = last - 1, ...
+      return last_row_ - num_after + stride;
+    }
+    return row;
+  }
+
+ private:
+  const float* const HWY_RESTRICT first_row_;
+  const float* const HWY_RESTRICT last_row_;
+};
+
+struct WrapRowUnchanged {
+  HWY_INLINE const float* operator()(const float* const HWY_RESTRICT row,
+                                     int64_t /*stride*/) const {
+    return row;
+  }
+};
+
+}  // namespace hwy
+
+#endif  // HIGHWAY_HWY_CONTRIB_IMAGE_IMAGE_H_
diff --git a/hwy/contrib/image/image_test.cc b/hwy/contrib/image/image_test.cc
new file mode 100644
index 0000000..6886577
--- /dev/null
+++ b/hwy/contrib/image/image_test.cc
@@ -0,0 +1,152 @@
+// Copyright (c) the JPEG XL Project
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "hwy/contrib/image/image.h"
+
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+
+#include <random>
+#include <utility>
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "hwy/contrib/image/image_test.cc"
+#include "hwy/foreach_target.h"  // IWYU pragma: keep
+
+// After foreach_target:
+#include "hwy/highway.h"
+#include "hwy/tests/test_util-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+// Ensure we can always write full aligned vectors.
+struct TestAlignedT {
+  template <typename T>
+  void operator()(T /*unused*/) const {
+    std::mt19937 rng(129);
+    std::uniform_int_distribution<int> dist(0, 16);
+    const ScalableTag<T> d;
+
+    for (size_t ysize = 1; ysize < 4; ++ysize) {
+      for (size_t xsize = 1; xsize < 64; ++xsize) {
+        Image<T> img(xsize, ysize);
+
+        for (size_t y = 0; y < ysize; ++y) {
+          T* HWY_RESTRICT row = img.MutableRow(y);
+          for (size_t x = 0; x < xsize; x += Lanes(d)) {
+            const auto values = Iota(d, static_cast<T>(dist(rng)));
+            Store(values, d, row + x);
+          }
+        }
+
+        // Sanity check to prevent optimizing out the writes
+        const auto x = std::uniform_int_distribution<size_t>(0, xsize - 1)(rng);
+        const auto y = std::uniform_int_distribution<size_t>(0, ysize - 1)(rng);
+        HWY_ASSERT(img.ConstRow(y)[x] < 16 + Lanes(d));
+      }
+    }
+  }
+};
+
+void TestAligned() { ForUnsignedTypes(TestAlignedT()); }
+
+// Ensure we can write an unaligned vector starting at the last valid value.
+struct TestUnalignedT {
+  template <typename T>
+  void operator()(T /*unused*/) const {
+    std::mt19937 rng(129);
+    std::uniform_int_distribution<int> dist(0, 3);
+    const ScalableTag<T> d;
+
+    for (size_t ysize = 1; ysize < 4; ++ysize) {
+      for (size_t xsize = 1; xsize < 128; ++xsize) {
+        Image<T> img(xsize, ysize);
+        img.InitializePaddingForUnalignedAccesses();
+
+// This test reads padding, which only works if it was initialized,
+// which only happens in MSAN builds.
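+// (Without MSAN the padding stays uninitialized, so the #else branch below
+// instead verifies that an unaligned store at the last valid sample does not
+// clobber its neighbors.)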
+#if HWY_IS_MSAN || HWY_IDE
+        // Initialize only the valid samples
+        for (size_t y = 0; y < ysize; ++y) {
+          T* HWY_RESTRICT row = img.MutableRow(y);
+          for (size_t x = 0; x < xsize; ++x) {
+            row[x] = static_cast<T>(1u << dist(rng));
+          }
+        }
+
+        // Read padding bits
+        auto accum = Zero(d);
+        for (size_t y = 0; y < ysize; ++y) {
+          T* HWY_RESTRICT row = img.MutableRow(y);
+          for (size_t x = 0; x < xsize; ++x) {
+            accum = Or(accum, LoadU(d, row + x));
+          }
+        }
+
+        // Ensure padding was zero
+        const size_t N = Lanes(d);
+        auto lanes = AllocateAligned<T>(N);
+        Store(accum, d, lanes.get());
+        for (size_t i = 0; i < N; ++i) {
+          HWY_ASSERT(lanes[i] < 16);
+        }
+#else   // Check that writing padding does not overwrite valid samples
+        // Initialize only the valid samples
+        for (size_t y = 0; y < ysize; ++y) {
+          T* HWY_RESTRICT row = img.MutableRow(y);
+          for (size_t x = 0; x < xsize; ++x) {
+            row[x] = static_cast<T>(x);
+          }
+        }
+
+        // Zero padding and rightmost sample
+        for (size_t y = 0; y < ysize; ++y) {
+          T* HWY_RESTRICT row = img.MutableRow(y);
+          StoreU(Zero(d), d, row + xsize - 1);
+        }
+
+        // Ensure no samples except the rightmost were overwritten
+        for (size_t y = 0; y < ysize; ++y) {
+          T* HWY_RESTRICT row = img.MutableRow(y);
+          for (size_t x = 0; x < xsize - 1; ++x) {
+            HWY_ASSERT_EQ(static_cast<T>(x), row[x]);
+          }
+        }
+#endif
+      }
+    }
+  }
+};
+
+void TestUnaligned() { ForUnsignedTypes(TestUnalignedT()); }
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+
+namespace hwy {
+HWY_BEFORE_TEST(ImageTest);
+HWY_EXPORT_AND_TEST_P(ImageTest, TestAligned);
+HWY_EXPORT_AND_TEST_P(ImageTest, TestUnaligned);
+}  // namespace hwy
+
+#endif
diff --git a/hwy/contrib/math/math-inl.h b/hwy/contrib/math/math-inl.h
new file mode 100644
index 0000000..b4cbb5d
--- /dev/null
+++ b/hwy/contrib/math/math-inl.h
@@ -0,0 +1,1242 @@
+// Copyright 2020 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Include guard (still compiled once per target)
+#if defined(HIGHWAY_HWY_CONTRIB_MATH_MATH_INL_H_) == \
+    defined(HWY_TARGET_TOGGLE)
+#ifdef HIGHWAY_HWY_CONTRIB_MATH_MATH_INL_H_
+#undef HIGHWAY_HWY_CONTRIB_MATH_MATH_INL_H_
+#else
+#define HIGHWAY_HWY_CONTRIB_MATH_MATH_INL_H_
+#endif
+
+#include "hwy/highway.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+/**
+ * Highway SIMD version of std::acos(x).
+ *
+ * Valid Lane Types: float32, float64
+ * Max Error: ULP = 2
+ * Valid Range: [-1, +1]
+ * @return arc cosine of 'x'
+ */
+template <class D, class V>
+HWY_INLINE V Acos(const D d, V x);
+template <class D, class V>
+HWY_NOINLINE V CallAcos(const D d, VecArg<V> x) {
+  return Acos(d, x);
+}
+
+/**
+ * Highway SIMD version of std::acosh(x).
+ *
+ * Valid Lane Types: float32, float64
+ * Max Error: ULP = 3
+ * Valid Range: float32[1, +FLT_MAX], float64[1, +DBL_MAX]
+ * @return hyperbolic arc cosine of 'x'
+ */
+template <class D, class V>
+HWY_INLINE V Acosh(const D d, V x);
+template <class D, class V>
+HWY_NOINLINE V CallAcosh(const D d, VecArg<V> x) {
+  return Acosh(d, x);
+}
+
+/**
+ * Highway SIMD version of std::asin(x).
+ *
+ * Valid Lane Types: float32, float64
+ * Max Error: ULP = 2
+ * Valid Range: [-1, +1]
+ * @return arc sine of 'x'
+ */
+template <class D, class V>
+HWY_INLINE V Asin(const D d, V x);
+template <class D, class V>
+HWY_NOINLINE V CallAsin(const D d, VecArg<V> x) {
+  return Asin(d, x);
+}
+
+/**
+ * Highway SIMD version of std::asinh(x).
+ *
+ * Valid Lane Types: float32, float64
+ * Max Error: ULP = 3
+ * Valid Range: float32[-FLT_MAX, +FLT_MAX], float64[-DBL_MAX, +DBL_MAX]
+ * @return hyperbolic arc sine of 'x'
+ */
+template <class D, class V>
+HWY_INLINE V Asinh(const D d, V x);
+template <class D, class V>
+HWY_NOINLINE V CallAsinh(const D d, VecArg<V> x) {
+  return Asinh(d, x);
+}
+
+/**
+ * Highway SIMD version of std::atan(x).
+ *
+ * Valid Lane Types: float32, float64
+ * Max Error: ULP = 3
+ * Valid Range: float32[-FLT_MAX, +FLT_MAX], float64[-DBL_MAX, +DBL_MAX]
+ * @return arc tangent of 'x'
+ */
+template <class D, class V>
+HWY_INLINE V Atan(const D d, V x);
+template <class D, class V>
+HWY_NOINLINE V CallAtan(const D d, VecArg<V> x) {
+  return Atan(d, x);
+}
+
+/**
+ * Highway SIMD version of std::atanh(x).
+ *
+ * Valid Lane Types: float32, float64
+ * Max Error: ULP = 3
+ * Valid Range: (-1, +1)
+ * @return hyperbolic arc tangent of 'x'
+ */
+template <class D, class V>
+HWY_INLINE V Atanh(const D d, V x);
+template <class D, class V>
+HWY_NOINLINE V CallAtanh(const D d, VecArg<V> x) {
+  return Atanh(d, x);
+}
+
+/**
+ * Highway SIMD version of std::cos(x).
+ *
+ * Valid Lane Types: float32, float64
+ * Max Error: ULP = 3
+ * Valid Range: [-39000, +39000]
+ * @return cosine of 'x'
+ */
+template <class D, class V>
+HWY_INLINE V Cos(const D d, V x);
+template <class D, class V>
+HWY_NOINLINE V CallCos(const D d, VecArg<V> x) {
+  return Cos(d, x);
+}
+
+/**
+ * Highway SIMD version of std::exp(x).
+ *
+ * Valid Lane Types: float32, float64
+ * Max Error: ULP = 1
+ * Valid Range: float32[-FLT_MAX, +104], float64[-DBL_MAX, +706]
+ * @return e^x
+ */
+template <class D, class V>
+HWY_INLINE V Exp(const D d, V x);
+template <class D, class V>
+HWY_NOINLINE V CallExp(const D d, VecArg<V> x) {
+  return Exp(d, x);
+}
+
+/**
+ * Highway SIMD version of std::expm1(x).
+ *
+ * Valid Lane Types: float32, float64
+ * Max Error: ULP = 4
+ * Valid Range: float32[-FLT_MAX, +104], float64[-DBL_MAX, +706]
+ * @return e^x - 1
+ */
+template <class D, class V>
+HWY_INLINE V Expm1(const D d, V x);
+template <class D, class V>
+HWY_NOINLINE V CallExpm1(const D d, VecArg<V> x) {
+  return Expm1(d, x);
+}
+
+/**
+ * Highway SIMD version of std::log(x).
+ *
+ * Valid Lane Types: float32, float64
+ * Max Error: ULP = 4
+ * Valid Range: float32(0, +FLT_MAX], float64(0, +DBL_MAX]
+ * @return natural logarithm of 'x'
+ */
+template <class D, class V>
+HWY_INLINE V Log(const D d, V x);
+template <class D, class V>
+HWY_NOINLINE V CallLog(const D d, VecArg<V> x) {
+  return Log(d, x);
+}
+
+/**
+ * Highway SIMD version of std::log10(x).
+ *
+ * Valid Lane Types: float32, float64
+ * Max Error: ULP = 2
+ * Valid Range: float32(0, +FLT_MAX], float64(0, +DBL_MAX]
+ * @return base 10 logarithm of 'x'
+ */
+template <class D, class V>
+HWY_INLINE V Log10(const D d, V x);
+template <class D, class V>
+HWY_NOINLINE V CallLog10(const D d, VecArg<V> x) {
+  return Log10(d, x);
+}
+
+/**
+ * Highway SIMD version of std::log1p(x).
+ *
+ * Valid Lane Types: float32, float64
+ * Max Error: ULP = 2
+ * Valid Range: float32[0, +FLT_MAX], float64[0, +DBL_MAX]
+ * @return log(1 + x)
+ */
+template <class D, class V>
+HWY_INLINE V Log1p(const D d, V x);
+template <class D, class V>
+HWY_NOINLINE V CallLog1p(const D d, VecArg<V> x) {
+  return Log1p(d, x);
+}
+
+/**
+ * Highway SIMD version of std::log2(x).
+ *
+ * Valid Lane Types: float32, float64
+ * Max Error: ULP = 2
+ * Valid Range: float32(0, +FLT_MAX], float64(0, +DBL_MAX]
+ * @return base 2 logarithm of 'x'
+ */
+template <class D, class V>
+HWY_INLINE V Log2(const D d, V x);
+template <class D, class V>
+HWY_NOINLINE V CallLog2(const D d, VecArg<V> x) {
+  return Log2(d, x);
+}
+
+/**
+ * Highway SIMD version of std::sin(x).
+ *
+ * Valid Lane Types: float32, float64
+ * Max Error: ULP = 3
+ * Valid Range: [-39000, +39000]
+ * @return sine of 'x'
+ */
+template <class D, class V>
+HWY_INLINE V Sin(const D d, V x);
+template <class D, class V>
+HWY_NOINLINE V CallSin(const D d, VecArg<V> x) {
+  return Sin(d, x);
+}
+
+/**
+ * Highway SIMD version of std::sinh(x).
+ *
+ * Valid Lane Types: float32, float64
+ * Max Error: ULP = 4
+ * Valid Range: float32[-88.7228, +88.7228], float64[-709, +709]
+ * @return hyperbolic sine of 'x'
+ */
+template <class D, class V>
+HWY_INLINE V Sinh(const D d, V x);
+template <class D, class V>
+HWY_NOINLINE V CallSinh(const D d, VecArg<V> x) {
+  return Sinh(d, x);
+}
+
+/**
+ * Highway SIMD version of std::tanh(x).
+ *
+ * Valid Lane Types: float32, float64
+ * Max Error: ULP = 4
+ * Valid Range: float32[-FLT_MAX, +FLT_MAX], float64[-DBL_MAX, +DBL_MAX]
+ * @return hyperbolic tangent of 'x'
+ */
+template <class D, class V>
+HWY_INLINE V Tanh(const D d, V x);
+template <class D, class V>
+HWY_NOINLINE V CallTanh(const D d, VecArg<V> x) {
+  return Tanh(d, x);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Implementation
+////////////////////////////////////////////////////////////////////////////////
+namespace impl {
+
+// Estrin's Scheme is a faster method for evaluating large polynomials on
+// super scalar architectures. It works by factoring the Horner's Method
+// polynomial into power of two sub-trees that can be evaluated in parallel.
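+// e.g. for degree 3: c0 + c1*x + c2*x^2 + c3*x^3 is evaluated below as
+// MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0)), i.e.
+// (c0 + c1*x) + x^2*(c2 + c3*x); the two inner FMAs have no data dependency
+// and can execute in parallel.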
+// Wikipedia Link: https://en.wikipedia.org/wiki/Estrin%27s_scheme
+template <class T>
+HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1) {
+  return MulAdd(c1, x, c0);
+}
+template <class T>
+HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2) {
+  T x2 = Mul(x, x);
+  return MulAdd(x2, c2, MulAdd(c1, x, c0));
+}
+template <class T>
+HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3) {
+  T x2 = Mul(x, x);
+  return MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0));
+}
+template <class T>
+HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4) {
+  T x2 = Mul(x, x);
+  T x4 = Mul(x2, x2);
+  return MulAdd(x4, c4, MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0)));
+}
+template <class T>
+HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5) {
+  T x2 = Mul(x, x);
+  T x4 = Mul(x2, x2);
+  return MulAdd(x4, MulAdd(c5, x, c4),
+                MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0)));
+}
+template <class T>
+HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5,
+                                     T c6) {
+  T x2 = Mul(x, x);
+  T x4 = Mul(x2, x2);
+  return MulAdd(x4, MulAdd(x2, c6, MulAdd(c5, x, c4)),
+                MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0)));
+}
+template <class T>
+HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5,
+                                     T c6, T c7) {
+  T x2 = Mul(x, x);
+  T x4 = Mul(x2, x2);
+  return MulAdd(x4, MulAdd(x2, MulAdd(c7, x, c6), MulAdd(c5, x, c4)),
+                MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0)));
+}
+template <class T>
+HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5,
+                                     T c6, T c7, T c8) {
+  T x2 = Mul(x, x);
+  T x4 = Mul(x2, x2);
+  T x8 = Mul(x4, x4);
+  return MulAdd(x8, c8,
+                MulAdd(x4, MulAdd(x2, MulAdd(c7, x, c6), MulAdd(c5, x, c4)),
+                       MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0))));
+}
+template <class T>
+HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5,
+                                     T c6, T c7, T c8, T c9) {
+  T x2 = Mul(x, x);
+  T x4 = Mul(x2, x2);
+  T x8 = Mul(x4, x4);
+  return MulAdd(x8, MulAdd(c9, x, c8),
+                MulAdd(x4, MulAdd(x2, MulAdd(c7, x, c6), MulAdd(c5, x, c4)),
+                       MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0))));
+}
+template <class T>
+HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5,
+                                     T c6, T c7, T c8, T c9, T c10) {
+  T x2 = Mul(x, x);
+  T x4 = Mul(x2, x2);
+  T x8 = Mul(x4, x4);
+  return MulAdd(x8, MulAdd(x2, c10, MulAdd(c9, x, c8)),
+                MulAdd(x4, MulAdd(x2, MulAdd(c7, x, c6), MulAdd(c5, x, c4)),
+                       MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0))));
+}
+template <class T>
+HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5,
+                                     T c6, T c7, T c8, T c9, T c10, T c11) {
+  T x2 = Mul(x, x);
+  T x4 = Mul(x2, x2);
+  T x8 = Mul(x4, x4);
+  return MulAdd(x8, MulAdd(x2, MulAdd(c11, x, c10), MulAdd(c9, x, c8)),
+                MulAdd(x4, MulAdd(x2, MulAdd(c7, x, c6), MulAdd(c5, x, c4)),
+                       MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0))));
+}
+template <class T>
+HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5,
+                                     T c6, T c7, T c8, T c9, T c10, T c11,
+                                     T c12) {
+  T x2 = Mul(x, x);
+  T x4 = Mul(x2, x2);
+  T x8 = Mul(x4, x4);
+  return MulAdd(
+      x8, MulAdd(x4, c12, MulAdd(x2, MulAdd(c11, x, c10), MulAdd(c9, x, c8))),
+      MulAdd(x4, MulAdd(x2, MulAdd(c7, x, c6), MulAdd(c5, x, c4)),
+             MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0))));
+}
+template <class T>
+HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5,
+                                     T c6, T c7, T c8, T c9, T c10, T c11,
+                                     T c12, T c13) {
+  T x2 = Mul(x, x);
+  T x4 = Mul(x2, x2);
+  T x8 = Mul(x4, x4);
+  return MulAdd(x8,
+                MulAdd(x4, MulAdd(c13, x, c12),
+                       MulAdd(x2, MulAdd(c11, x, c10),
+                              MulAdd(c9, x, c8))),
+                MulAdd(x4, MulAdd(x2, MulAdd(c7, x, c6), MulAdd(c5, x, c4)),
+                       MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0))));
+}
+template <class T>
+HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5,
+                                     T c6, T c7, T c8, T c9, T c10, T c11,
+                                     T c12, T c13, T c14) {
+  T x2 = Mul(x, x);
+  T x4 = Mul(x2, x2);
+  T x8 = Mul(x4, x4);
+  return MulAdd(x8,
+                MulAdd(x4, MulAdd(x2, c14, MulAdd(c13, x, c12)),
+                       MulAdd(x2, MulAdd(c11, x, c10), MulAdd(c9, x, c8))),
+                MulAdd(x4, MulAdd(x2, MulAdd(c7, x, c6), MulAdd(c5, x, c4)),
+                       MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0))));
+}
+template <class T>
+HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5,
+                                     T c6, T c7, T c8, T c9, T c10, T c11,
+                                     T c12, T c13, T c14, T c15) {
+  T x2 = Mul(x, x);
+  T x4 = Mul(x2, x2);
+  T x8 = Mul(x4, x4);
+  return MulAdd(x8,
+                MulAdd(x4, MulAdd(x2, MulAdd(c15, x, c14), MulAdd(c13, x, c12)),
+                       MulAdd(x2, MulAdd(c11, x, c10), MulAdd(c9, x, c8))),
+                MulAdd(x4, MulAdd(x2, MulAdd(c7, x, c6), MulAdd(c5, x, c4)),
+                       MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0))));
+}
+template <class T>
+HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5,
+                                     T c6, T c7, T c8, T c9, T c10, T c11,
+                                     T c12, T c13, T c14, T c15, T c16) {
+  T x2 = Mul(x, x);
+  T x4 = Mul(x2, x2);
+  T x8 = Mul(x4, x4);
+  T x16 = Mul(x8, x8);
+  return MulAdd(
+      x16, c16,
+      MulAdd(x8,
+             MulAdd(x4, MulAdd(x2, MulAdd(c15, x, c14), MulAdd(c13, x, c12)),
+                    MulAdd(x2, MulAdd(c11, x, c10), MulAdd(c9, x, c8))),
+             MulAdd(x4, MulAdd(x2, MulAdd(c7, x, c6), MulAdd(c5, x, c4)),
+                    MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0)))));
+}
+template <class T>
+HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5,
+                                     T c6, T c7, T c8, T c9, T c10, T c11,
+                                     T c12, T c13, T c14, T c15, T c16, T c17) {
+  T x2 = Mul(x, x);
+  T x4 = Mul(x2, x2);
+  T x8 = Mul(x4, x4);
+  T x16 = Mul(x8, x8);
+  return MulAdd(
+      x16, MulAdd(c17, x, c16),
+      MulAdd(x8,
+             MulAdd(x4, MulAdd(x2, MulAdd(c15, x, c14), MulAdd(c13, x, c12)),
+                    MulAdd(x2, MulAdd(c11, x, c10), MulAdd(c9, x, c8))),
+             MulAdd(x4, MulAdd(x2, MulAdd(c7, x, c6), MulAdd(c5, x, c4)),
+                    MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0)))));
+}
+template <class T>
+HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5,
+                                     T c6, T c7, T c8, T c9, T c10, T c11,
+                                     T c12, T c13, T c14, T c15, T c16, T c17,
+                                     T c18) {
+  T x2 = Mul(x, x);
+  T x4 = Mul(x2, x2);
+  T x8 = Mul(x4, x4);
+  T x16 = Mul(x8, x8);
+  return MulAdd(
+      x16, MulAdd(x2, c18, MulAdd(c17, x, c16)),
+      MulAdd(x8,
+             MulAdd(x4, MulAdd(x2, MulAdd(c15, x, c14), MulAdd(c13, x, c12)),
+                    MulAdd(x2, MulAdd(c11, x, c10), MulAdd(c9, x, c8))),
+             MulAdd(x4, MulAdd(x2, MulAdd(c7, x, c6), MulAdd(c5, x, c4)),
+                    MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0)))));
+}
+
+template <class FloatOrDouble>
+struct AsinImpl {};
+template <class FloatOrDouble>
+struct AtanImpl {};
+template <class FloatOrDouble>
+struct CosSinImpl {};
+template <class FloatOrDouble>
+struct ExpImpl {};
+template <class FloatOrDouble>
+struct LogImpl {};
+
+template <>
+struct AsinImpl<float> {
+  // Polynomial approximation for asin(x) over the range [0, 0.5).
+  template <class D, class V>
+  HWY_INLINE V AsinPoly(D d, V x2, V /*x*/) {
+    const auto k0 = Set(d, +0.1666677296f);
+    const auto k1 = Set(d, +0.07495029271f);
+    const auto k2 = Set(d, +0.04547423869f);
+    const auto k3 = Set(d, +0.02424046025f);
+    const auto k4 = Set(d, +0.04197454825f);
+
+    return Estrin(x2, k0, k1, k2, k3, k4);
+  }
+};
+
+#if HWY_HAVE_FLOAT64 && HWY_HAVE_INTEGER64
+
+template <>
+struct AsinImpl<double> {
+  // Polynomial approximation for asin(x) over the range [0, 0.5).
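+  // (Larger inputs are typically first reduced via the identity
+  // asin(x) = pi/2 - 2*asin(sqrt((1 - x) / 2)), so [0, 0.5) suffices.)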
+  template <class D, class V>
+  HWY_INLINE V AsinPoly(D d, V x2, V /*x*/) {
+    const auto k0 = Set(d, +0.1666666666666497543);
+    const auto k1 = Set(d, +0.07500000000378581611);
+    const auto k2 = Set(d, +0.04464285681377102438);
+    const auto k3 = Set(d, +0.03038195928038132237);
+    const auto k4 = Set(d, +0.02237176181932048341);
+    const auto k5 = Set(d, +0.01735956991223614604);
+    const auto k6 = Set(d, +0.01388715184501609218);
+    const auto k7 = Set(d, +0.01215360525577377331);
+    const auto k8 = Set(d, +0.006606077476277170610);
+    const auto k9 = Set(d, +0.01929045477267910674);
+    const auto k10 = Set(d, -0.01581918243329996643);
+    const auto k11 = Set(d, +0.03161587650653934628);
+
+    return Estrin(x2, k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10, k11);
+  }
+};
+
+#endif
+
+template <>
+struct AtanImpl<float> {
+  // Polynomial approximation for atan(x) over the range [0, 1.0).
+  template <class D, class V>
+  HWY_INLINE V AtanPoly(D d, V x) {
+    const auto k0 = Set(d, -0.333331018686294555664062f);
+    const auto k1 = Set(d, +0.199926957488059997558594f);
+    const auto k2 = Set(d, -0.142027363181114196777344f);
+    const auto k3 = Set(d, +0.106347933411598205566406f);
+    const auto k4 = Set(d, -0.0748900920152664184570312f);
+    const auto k5 = Set(d, +0.0425049886107444763183594f);
+    const auto k6 = Set(d, -0.0159569028764963150024414f);
+    const auto k7 = Set(d, +0.00282363896258175373077393f);
+
+    const auto y = Mul(x, x);
+    return MulAdd(Estrin(y, k0, k1, k2, k3, k4, k5, k6, k7), Mul(y, x), x);
+  }
+};
+
+#if HWY_HAVE_FLOAT64 && HWY_HAVE_INTEGER64
+
+template <>
+struct AtanImpl<double> {
+  // Polynomial approximation for atan(x) over the range [0, 1.0).
+  template <class D, class V>
+  HWY_INLINE V AtanPoly(D d, V x) {
+    const auto k0 = Set(d, -0.333333333333311110369124);
+    const auto k1 = Set(d, +0.199999999996591265594148);
+    const auto k2 = Set(d, -0.14285714266771329383765);
+    const auto k3 = Set(d, +0.111111105648261418443745);
+    const auto k4 = Set(d, -0.090908995008245008229153);
+    const auto k5 = Set(d, +0.0769219538311769618355029);
+    const auto k6 = Set(d, -0.0666573579361080525984562);
+    const auto k7 = Set(d, +0.0587666392926673580854313);
+    const auto k8 = Set(d, -0.0523674852303482457616113);
+    const auto k9 = Set(d, +0.0466667150077840625632675);
+    const auto k10 = Set(d, -0.0407629191276836500001934);
+    const auto k11 = Set(d, +0.0337852580001353069993897);
+    const auto k12 = Set(d, -0.0254517624932312641616861);
+    const auto k13 = Set(d, +0.016599329773529201970117);
+    const auto k14 = Set(d, -0.00889896195887655491740809);
+    const auto k15 = Set(d, +0.00370026744188713119232403);
+    const auto k16 = Set(d, -0.00110611831486672482563471);
+    const auto k17 = Set(d, +0.000209850076645816976906797);
+    const auto k18 = Set(d, -1.88796008463073496563746e-5);
+
+    const auto y = Mul(x, x);
+    return MulAdd(Estrin(y, k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10, k11,
+                         k12, k13, k14, k15, k16, k17, k18),
+                  Mul(y, x), x);
+  }
+};
+
+#endif
+
+template <>
+struct CosSinImpl<float> {
+  // Rounds float toward zero and returns as int32_t.
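+  // (Used by Sin/Cos to compute the quadrant index q; int32 suffices because
+  // the documented valid input range is only [-39000, +39000].)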
+  template <class D, class V>
+  HWY_INLINE Vec<Rebind<int32_t, D>> ToInt32(D /*unused*/, V x) {
+    return ConvertTo(Rebind<int32_t, D>(), x);
+  }
+
+  template <class D, class V>
+  HWY_INLINE V Poly(D d, V x) {
+    const auto k0 = Set(d, -1.66666597127914428710938e-1f);
+    const auto k1 = Set(d, +8.33307858556509017944336e-3f);
+    const auto k2 = Set(d, -1.981069071916863322258e-4f);
+    const auto k3 = Set(d, +2.6083159809786593541503e-6f);
+
+    const auto y = Mul(x, x);
+    return MulAdd(Estrin(y, k0, k1, k2, k3), Mul(y, x), x);
+  }
+
+  template <class D, class V, class VI32>
+  HWY_INLINE V CosReduce(D d, V x, VI32 q) {
+    // kHalfPiPart0f + kHalfPiPart1f + kHalfPiPart2f + kHalfPiPart3f ~= -pi/2
+    const V kHalfPiPart0f = Set(d, -0.5f * 3.140625f);
+    const V kHalfPiPart1f = Set(d, -0.5f * 0.0009670257568359375f);
+    const V kHalfPiPart2f = Set(d, -0.5f * 6.2771141529083251953e-7f);
+    const V kHalfPiPart3f = Set(d, -0.5f * 1.2154201256553420762e-10f);
+
+    // Extended precision modular arithmetic.
+    const V qf = ConvertTo(d, q);
+    x = MulAdd(qf, kHalfPiPart0f, x);
+    x = MulAdd(qf, kHalfPiPart1f, x);
+    x = MulAdd(qf, kHalfPiPart2f, x);
+    x = MulAdd(qf, kHalfPiPart3f, x);
+    return x;
+  }
+
+  template <class D, class V, class VI32>
+  HWY_INLINE V SinReduce(D d, V x, VI32 q) {
+    // kPiPart0f + kPiPart1f + kPiPart2f + kPiPart3f ~= -pi
+    const V kPiPart0f = Set(d, -3.140625f);
+    const V kPiPart1f = Set(d, -0.0009670257568359375f);
+    const V kPiPart2f = Set(d, -6.2771141529083251953e-7f);
+    const V kPiPart3f = Set(d, -1.2154201256553420762e-10f);
+
+    // Extended precision modular arithmetic.
+    const V qf = ConvertTo(d, q);
+    x = MulAdd(qf, kPiPart0f, x);
+    x = MulAdd(qf, kPiPart1f, x);
+    x = MulAdd(qf, kPiPart2f, x);
+    x = MulAdd(qf, kPiPart3f, x);
+    return x;
+  }
+
+  // (q & 2) == 0 ? -0.0 : +0.0
+  template <class D, class VI32>
+  HWY_INLINE Vec<Rebind<float, D>> CosSignFromQuadrant(D d, VI32 q) {
+    const VI32 kTwo = Set(Rebind<int32_t, D>(), 2);
+    return BitCast(d, ShiftLeft<30>(AndNot(q, kTwo)));
+  }
+
+  // ((q & 1) ? -0.0 : +0.0)
+  template <class D, class VI32>
+  HWY_INLINE Vec<Rebind<float, D>> SinSignFromQuadrant(D d, VI32 q) {
+    const VI32 kOne = Set(Rebind<int32_t, D>(), 1);
+    return BitCast(d, ShiftLeft<31>(And(q, kOne)));
+  }
+};
+
+#if HWY_HAVE_FLOAT64 && HWY_HAVE_INTEGER64
+
+template <>
+struct CosSinImpl<double> {
+  // Rounds double toward zero and returns as int32_t.
+  template <class D, class V>
+  HWY_INLINE Vec<Rebind<int32_t, D>> ToInt32(D /*unused*/, V x) {
+    return DemoteTo(Rebind<int32_t, D>(), x);
+  }
+
+  template <class D, class V>
+  HWY_INLINE V Poly(D d, V x) {
+    const auto k0 = Set(d, -0.166666666666666657414808);
+    const auto k1 = Set(d, +0.00833333333333332974823815);
+    const auto k2 = Set(d, -0.000198412698412696162806809);
+    const auto k3 = Set(d, +2.75573192239198747630416e-6);
+    const auto k4 = Set(d, -2.50521083763502045810755e-8);
+    const auto k5 = Set(d, +1.60590430605664501629054e-10);
+    const auto k6 = Set(d, -7.64712219118158833288484e-13);
+    const auto k7 = Set(d, +2.81009972710863200091251e-15);
+    const auto k8 = Set(d, -7.97255955009037868891952e-18);
+
+    const auto y = Mul(x, x);
+    return MulAdd(Estrin(y, k0, k1, k2, k3, k4, k5, k6, k7, k8), Mul(y, x), x);
+  }
+
+  template <class D, class V, class VI32>
+  HWY_INLINE V CosReduce(D d, V x, VI32 q) {
+    // kHalfPiPart0d + kHalfPiPart1d + kHalfPiPart2d + kHalfPiPart3d ~= -pi/2
+    const V kHalfPiPart0d = Set(d, -0.5 * 3.1415926218032836914);
+    const V kHalfPiPart1d = Set(d, -0.5 * 3.1786509424591713469e-8);
+    const V kHalfPiPart2d = Set(d, -0.5 * 1.2246467864107188502e-16);
+    const V kHalfPiPart3d = Set(d, -0.5 * 1.2736634327021899816e-24);
+
+    // Extended precision modular arithmetic.
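+    // This is Cody-Waite style reduction: pi/2 is split into parts of
+    // decreasing magnitude with trailing zero mantissa bits, so each q * part
+    // can be subtracted exactly and the small remainder keeps its significand.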
+    const V qf = PromoteTo(d, q);
+    x = MulAdd(qf, kHalfPiPart0d, x);
+    x = MulAdd(qf, kHalfPiPart1d, x);
+    x = MulAdd(qf, kHalfPiPart2d, x);
+    x = MulAdd(qf, kHalfPiPart3d, x);
+    return x;
+  }
+
+  template <class D, class V, class VI32>
+  HWY_INLINE V SinReduce(D d, V x, VI32 q) {
+    // kPiPart0d + kPiPart1d + kPiPart2d + kPiPart3d ~= -pi
+    const V kPiPart0d = Set(d, -3.1415926218032836914);
+    const V kPiPart1d = Set(d, -3.1786509424591713469e-8);
+    const V kPiPart2d = Set(d, -1.2246467864107188502e-16);
+    const V kPiPart3d = Set(d, -1.2736634327021899816e-24);
+
+    // Extended precision modular arithmetic.
+    const V qf = PromoteTo(d, q);
+    x = MulAdd(qf, kPiPart0d, x);
+    x = MulAdd(qf, kPiPart1d, x);
+    x = MulAdd(qf, kPiPart2d, x);
+    x = MulAdd(qf, kPiPart3d, x);
+    return x;
+  }
+
+  // (q & 2) == 0 ? -0.0 : +0.0
+  template <class D, class VI32>
+  HWY_INLINE Vec<Rebind<double, D>> CosSignFromQuadrant(D d, VI32 q) {
+    const VI32 kTwo = Set(Rebind<int32_t, D>(), 2);
+    return BitCast(
+        d, ShiftLeft<62>(PromoteTo(Rebind<int64_t, D>(), AndNot(q, kTwo))));
+  }
+
+  // ((q & 1) ? -0.0 : +0.0)
+  template <class D, class VI32>
+  HWY_INLINE Vec<Rebind<double, D>> SinSignFromQuadrant(D d, VI32 q) {
+    const VI32 kOne = Set(Rebind<int32_t, D>(), 1);
+    return BitCast(
+        d, ShiftLeft<63>(PromoteTo(Rebind<int64_t, D>(), And(q, kOne))));
+  }
+};
+
+#endif
+
+template <>
+struct ExpImpl<float> {
+  // Rounds float toward zero and returns as int32_t.
+  template <class D, class V>
+  HWY_INLINE Vec<Rebind<int32_t, D>> ToInt32(D /*unused*/, V x) {
+    return ConvertTo(Rebind<int32_t, D>(), x);
+  }
+
+  template <class D, class V>
+  HWY_INLINE V ExpPoly(D d, V x) {
+    const auto k0 = Set(d, +0.5f);
+    const auto k1 = Set(d, +0.166666671633720397949219f);
+    const auto k2 = Set(d, +0.0416664853692054748535156f);
+    const auto k3 = Set(d, +0.00833336077630519866943359f);
+    const auto k4 = Set(d, +0.00139304355252534151077271f);
+    const auto k5 = Set(d, +0.000198527617612853646278381f);
+
+    return MulAdd(Estrin(x, k0, k1, k2, k3, k4, k5), Mul(x, x), x);
+  }
+
+  // Computes 2^x, where x is an integer.
+  template <class D, class VI32>
+  HWY_INLINE Vec<D> Pow2I(D d, VI32 x) {
+    const Rebind<int32_t, D> di32;
+    const VI32 kOffset = Set(di32, 0x7F);
+    return BitCast(d, ShiftLeft<23>(Add(x, kOffset)));
+  }
+
+  // Sets the exponent of 'x' to 2^e.
+  template <class D, class V, class VI32>
+  HWY_INLINE V LoadExpShortRange(D d, V x, VI32 e) {
+    const VI32 y = ShiftRight<1>(e);
+    return Mul(Mul(x, Pow2I(d, y)), Pow2I(d, Sub(e, y)));
+  }
+
+  template <class D, class V, class VI32>
+  HWY_INLINE V ExpReduce(D d, V x, VI32 q) {
+    // kLn2Part0f + kLn2Part1f ~= -ln(2)
+    const V kLn2Part0f = Set(d, -0.693145751953125f);
+    const V kLn2Part1f = Set(d, -1.428606765330187045e-6f);
+
+    // Extended precision modular arithmetic.
+    const V qf = ConvertTo(d, q);
+    x = MulAdd(qf, kLn2Part0f, x);
+    x = MulAdd(qf, kLn2Part1f, x);
+    return x;
+  }
+};
+
+template <>
+struct LogImpl<float> {
+  template <class D, class V>
+  HWY_INLINE Vec<Rebind<int32_t, D>> Log2p1NoSubnormal(D /*d*/, V x) {
+    const Rebind<int32_t, D> di32;
+    const Rebind<uint32_t, D> du32;
+    const auto kBias = Set(di32, 0x7F);
+    return Sub(BitCast(di32, ShiftRight<23>(BitCast(du32, x))), kBias);
+  }
+
+  // Approximates Log(x) over the range [sqrt(2) / 2, sqrt(2)].
+  template <class D, class V>
+  HWY_INLINE V LogPoly(D d, V x) {
+    const V k0 = Set(d, 0.66666662693f);
+    const V k1 = Set(d, 0.40000972152f);
+    const V k2 = Set(d, 0.28498786688f);
+    const V k3 = Set(d, 0.24279078841f);
+
+    const V x2 = Mul(x, x);
+    const V x4 = Mul(x2, x2);
+    return MulAdd(MulAdd(k2, x4, k0), x2, Mul(MulAdd(k3, x4, k1), x4));
+  }
+};
+
+#if HWY_HAVE_FLOAT64 && HWY_HAVE_INTEGER64
+template <>
+struct ExpImpl<double> {
+  // Rounds double toward zero and returns as int32_t.
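+  // (Exp computes e^x as 2^q * e^r with r = x - q * ln(2): ToInt32 supplies
+  // q, ExpPoly approximates e^r - 1, and LoadExpShortRange applies the 2^q
+  // scaling.)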
+  template <class D, class V>
+  HWY_INLINE Vec<Rebind<int32_t, D>> ToInt32(D /*unused*/, V x) {
+    return DemoteTo(Rebind<int32_t, D>(), x);
+  }
+
+  template <class D, class V>
+  HWY_INLINE V ExpPoly(D d, V x) {
+    const auto k0 = Set(d, +0.5);
+    const auto k1 = Set(d, +0.166666666666666851703837);
+    const auto k2 = Set(d, +0.0416666666666665047591422);
+    const auto k3 = Set(d, +0.00833333333331652721664984);
+    const auto k4 = Set(d, +0.00138888888889774492207962);
+    const auto k5 = Set(d, +0.000198412698960509205564975);
+    const auto k6 = Set(d, +2.4801587159235472998791e-5);
+    const auto k7 = Set(d, +2.75572362911928827629423e-6);
+    const auto k8 = Set(d, +2.75573911234900471893338e-7);
+    const auto k9 = Set(d, +2.51112930892876518610661e-8);
+    const auto k10 = Set(d, +2.08860621107283687536341e-9);
+
+    return MulAdd(Estrin(x, k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10),
+                  Mul(x, x), x);
+  }
+
+  // Computes 2^x, where x is an integer.
+  template <class D, class VI32>
+  HWY_INLINE Vec<D> Pow2I(D d, VI32 x) {
+    const Rebind<int32_t, D> di32;
+    const Rebind<int64_t, D> di64;
+    const VI32 kOffset = Set(di32, 0x3FF);
+    return BitCast(d, ShiftLeft<52>(PromoteTo(di64, Add(x, kOffset))));
+  }
+
+  // Sets the exponent of 'x' to 2^e.
+  template <class D, class V, class VI32>
+  HWY_INLINE V LoadExpShortRange(D d, V x, VI32 e) {
+    const VI32 y = ShiftRight<1>(e);
+    return Mul(Mul(x, Pow2I(d, y)), Pow2I(d, Sub(e, y)));
+  }
+
+  template <class D, class V, class VI32>
+  HWY_INLINE V ExpReduce(D d, V x, VI32 q) {
+    // kLn2Part0d + kLn2Part1d ~= -ln(2)
+    const V kLn2Part0d = Set(d, -0.6931471805596629565116018);
+    const V kLn2Part1d = Set(d, -0.28235290563031577122588448175e-12);
+
+    // Extended precision modular arithmetic.
+    const V qf = PromoteTo(d, q);
+    x = MulAdd(qf, kLn2Part0d, x);
+    x = MulAdd(qf, kLn2Part1d, x);
+    return x;
+  }
+};
+
+template <>
+struct LogImpl<double> {
+  template <class D, class V>
+  HWY_INLINE Vec<Rebind<int64_t, D>> Log2p1NoSubnormal(D /*d*/, V x) {
+    const Rebind<int64_t, D> di64;
+    const Rebind<uint64_t, D> du64;
+    return Sub(BitCast(di64, ShiftRight<52>(BitCast(du64, x))),
+               Set(di64, 0x3FF));
+  }
+
+  // Approximates Log(x) over the range [sqrt(2) / 2, sqrt(2)].
+  template <class D, class V>
+  HWY_INLINE V LogPoly(D d, V x) {
+    const V k0 = Set(d, 0.6666666666666735130);
+    const V k1 = Set(d, 0.3999999999940941908);
+    const V k2 = Set(d, 0.2857142874366239149);
+    const V k3 = Set(d, 0.2222219843214978396);
+    const V k4 = Set(d, 0.1818357216161805012);
+    const V k5 = Set(d, 0.1531383769920937332);
+    const V k6 = Set(d, 0.1479819860511658591);
+
+    const V x2 = Mul(x, x);
+    const V x4 = Mul(x2, x2);
+    return MulAdd(MulAdd(MulAdd(MulAdd(k6, x4, k4), x4, k2), x4, k0), x2,
+                  (Mul(MulAdd(MulAdd(k5, x4, k3), x4, k1), x4)));
+  }
+};
+
+#endif
+
+template <class D, class V, bool kAllowSubnormals = true>
+HWY_INLINE V Log(const D d, V x) {
+  // http://git.musl-libc.org/cgit/musl/tree/src/math/log.c for more info.
+  using T = TFromD<D>;
+  impl::LogImpl<T> impl;
+
+  constexpr bool kIsF32 = (sizeof(T) == 4);
+
+  // Float Constants
+  const V kLn2Hi = Set(d, kIsF32 ? static_cast<T>(0.69313812256f)
+                                 : static_cast<T>(0.693147180369123816490));
+  const V kLn2Lo = Set(d, kIsF32 ? static_cast<T>(9.0580006145e-6f)
+                                 : static_cast<T>(1.90821492927058770002e-10));
+  const V kOne = Set(d, static_cast<T>(+1.0));
+  const V kMinNormal = Set(d, kIsF32 ? static_cast<T>(1.175494351e-38f)
+                                     : static_cast<T>(2.2250738585072014e-308));
+  const V kScale = Set(d, kIsF32 ? static_cast<T>(3.355443200e+7f)
+                                 : static_cast<T>(1.8014398509481984e+16));
+
+  // Integer Constants
+  using TI = MakeSigned<T>;
+  const Rebind<TI, D> di;
+  using VI = decltype(Zero(di));
+  const VI kLowerBits = Set(di, kIsF32 ? static_cast<TI>(0x00000000L)
+                                       : static_cast<TI>(0xFFFFFFFFLL));
+  const VI kMagic =
      Set(di, kIsF32 ? static_cast<TI>(0x3F3504F3L)
+                     : static_cast<TI>(0x3FE6A09E00000000LL));
+  const VI kExpMask = Set(di, kIsF32 ? static_cast<TI>(0x3F800000L)
+                                     : static_cast<TI>(0x3FF0000000000000LL));
+  const VI kExpScale =
+      Set(di, kIsF32 ? static_cast<TI>(-25) : static_cast<TI>(-54));
+  const VI kManMask = Set(di, kIsF32 ? static_cast<TI>(0x7FFFFFL)
+                                     : static_cast<TI>(0xFFFFF00000000LL));
+
+  // Scale up 'x' so that it is no longer denormalized.
+  VI exp_bits;
+  V exp;
+  if (kAllowSubnormals == true) {
+    const auto is_denormal = Lt(x, kMinNormal);
+    x = IfThenElse(is_denormal, Mul(x, kScale), x);
+
+    // Compute the new exponent.
+    exp_bits = Add(BitCast(di, x), Sub(kExpMask, kMagic));
+    const VI exp_scale =
+        BitCast(di, IfThenElseZero(is_denormal, BitCast(d, kExpScale)));
+    exp = ConvertTo(
+        d, Add(exp_scale, impl.Log2p1NoSubnormal(d, BitCast(d, exp_bits))));
+  } else {
+    // Compute the new exponent.
+    exp_bits = Add(BitCast(di, x), Sub(kExpMask, kMagic));
+    exp = ConvertTo(d, impl.Log2p1NoSubnormal(d, BitCast(d, exp_bits)));
+  }
+
+  // Renormalize.
+  const V y = Or(And(x, BitCast(d, kLowerBits)),
+                 BitCast(d, Add(And(exp_bits, kManMask), kMagic)));
+
+  // Approximate and reconstruct.
+  const V ym1 = Sub(y, kOne);
+  const V z = Div(ym1, Add(y, kOne));
+
+  return MulSub(
+      exp, kLn2Hi,
+      Sub(MulSub(z, Sub(ym1, impl.LogPoly(d, z)), Mul(exp, kLn2Lo)), ym1));
+}
+
+}  // namespace impl
+
+template <class D, class V>
+HWY_INLINE V Acos(const D d, V x) {
+  using T = TFromD<D>;
+
+  const V kZero = Zero(d);
+  const V kHalf = Set(d, static_cast<T>(+0.5));
+  const V kPi = Set(d, static_cast<T>(+3.14159265358979323846264));
+  const V kPiOverTwo = Set(d, static_cast<T>(+1.57079632679489661923132169));
+
+  const V sign_x = And(SignBit(d), x);
+  const V abs_x = Xor(x, sign_x);
+  const auto mask = Lt(abs_x, kHalf);
+  const V yy =
+      IfThenElse(mask, Mul(abs_x, abs_x), NegMulAdd(abs_x, kHalf, kHalf));
+  const V y = IfThenElse(mask, abs_x, Sqrt(yy));
+
+  impl::AsinImpl<T> impl;
+  const V t = Mul(impl.AsinPoly(d, yy, y), Mul(y, yy));
+
+  const V t_plus_y = Add(t, y);
+  const V z =
+      IfThenElse(mask, Sub(kPiOverTwo, Add(Xor(y, sign_x), Xor(t, sign_x))),
+                 Add(t_plus_y, t_plus_y));
+  return IfThenElse(Or(mask, Ge(x, kZero)), z, Sub(kPi, z));
+}
+
+template <class D, class V>
+HWY_INLINE V Acosh(const D d, V x) {
+  using T = TFromD<D>;
+
+  const V kLarge = Set(d, static_cast<T>(268435456.0));
+  const V kLog2 = Set(d, static_cast<T>(0.693147180559945286227));
+  const V kOne = Set(d, static_cast<T>(+1.0));
+  const V kTwo = Set(d, static_cast<T>(+2.0));
+
+  const auto is_x_large = Gt(x, kLarge);
+  const auto is_x_gt_2 = Gt(x, kTwo);
+
+  const V x_minus_1 = Sub(x, kOne);
+  const V y0 = MulSub(kTwo, x, Div(kOne, Add(Sqrt(MulSub(x, x, kOne)), x)));
+  const V y1 =
+      Add(Sqrt(MulAdd(x_minus_1, kTwo, Mul(x_minus_1, x_minus_1))), x_minus_1);
+  const V y2 =
+      IfThenElse(is_x_gt_2, IfThenElse(is_x_large, x, y0), Add(y1, kOne));
+  const V z = impl::Log(d, y2);
+
+  const auto is_pole = Eq(y2, kOne);
+  const auto divisor = Sub(IfThenZeroElse(is_pole, y2), kOne);
+  return Add(IfThenElse(is_x_gt_2, z,
+                        IfThenElse(is_pole, y1, Div(Mul(z, y1), divisor))),
+             IfThenElseZero(is_x_large, kLog2));
+}
+
+template <class D, class V>
+HWY_INLINE V Asin(const D d, V x) {
+  using T = TFromD<D>;
+
+  const V kHalf = Set(d, static_cast<T>(+0.5));
+  const V kTwo = Set(d, static_cast<T>(+2.0));
+  const V kPiOverTwo = Set(d, static_cast<T>(+1.57079632679489661923132169));
+
+  const V sign_x = And(SignBit(d), x);
+  const V abs_x = Xor(x, sign_x);
+  const auto mask = Lt(abs_x, kHalf);
+  const V yy =
+      IfThenElse(mask, Mul(abs_x, abs_x), NegMulAdd(abs_x, kHalf, kHalf));
+  const V y =
      IfThenElse(mask, abs_x, Sqrt(yy));
+
+  impl::AsinImpl<T> impl;
+  const V z0 = MulAdd(impl.AsinPoly(d, yy, y), Mul(yy, y), y);
+  const V z1 = NegMulAdd(z0, kTwo, kPiOverTwo);
+  return Or(IfThenElse(mask, z0, z1), sign_x);
+}
+
+template <class D, class V>
+HWY_INLINE V Asinh(const D d, V x) {
+  using T = TFromD<D>;
+
+  const V kSmall = Set(d, static_cast<T>(1.0 / 268435456.0));
+  const V kLarge = Set(d, static_cast<T>(268435456.0));
+  const V kLog2 = Set(d, static_cast<T>(0.693147180559945286227));
+  const V kOne = Set(d, static_cast<T>(+1.0));
+  const V kTwo = Set(d, static_cast<T>(+2.0));
+
+  const V sign_x = And(SignBit(d), x);  // Extract the sign bit
+  const V abs_x = Xor(x, sign_x);
+
+  const auto is_x_large = Gt(abs_x, kLarge);
+  const auto is_x_lt_2 = Lt(abs_x, kTwo);
+
+  const V x2 = Mul(x, x);
+  const V sqrt_x2_plus_1 = Sqrt(Add(x2, kOne));
+
+  const V y0 = MulAdd(abs_x, kTwo, Div(kOne, Add(sqrt_x2_plus_1, abs_x)));
+  const V y1 = Add(Div(x2, Add(sqrt_x2_plus_1, kOne)), abs_x);
+  const V y2 =
+      IfThenElse(is_x_lt_2, Add(y1, kOne), IfThenElse(is_x_large, abs_x, y0));
+  const V z = impl::Log(d, y2);
+
+  const auto is_pole = Eq(y2, kOne);
+  const auto divisor = Sub(IfThenZeroElse(is_pole, y2), kOne);
+  const auto large = IfThenElse(is_pole, y1, Div(Mul(z, y1), divisor));
+  const V y = IfThenElse(Lt(abs_x, kSmall), x, large);
+  return Or(
+      Add(IfThenElse(is_x_lt_2, y, z), IfThenElseZero(is_x_large, kLog2)),
+      sign_x);
+}
+
+template <class D, class V>
+HWY_INLINE V Atan(const D d, V x) {
+  using T = TFromD<D>;
+
+  const V kOne = Set(d, static_cast<T>(+1.0));
+  const V kPiOverTwo = Set(d, static_cast<T>(+1.57079632679489661923132169));
+
+  const V sign = And(SignBit(d), x);
+  const V abs_x = Xor(x, sign);
+  const auto mask = Gt(abs_x, kOne);
+
+  impl::AtanImpl<T> impl;
+  const auto divisor = IfThenElse(mask, abs_x, kOne);
+  const V y = impl.AtanPoly(d, IfThenElse(mask, Div(kOne, divisor), abs_x));
+  return Or(IfThenElse(mask, Sub(kPiOverTwo, y), y), sign);
+}
+
+template <class D, class V>
+HWY_INLINE V Atanh(const D d, V x) {
+  using T = TFromD<D>;
+
+  const V kHalf = Set(d, static_cast<T>(+0.5));
+  const V kOne = Set(d, static_cast<T>(+1.0));
+
+  const V sign = And(SignBit(d), x);  // Extract the sign bit
+  const V abs_x = Xor(x, sign);
+  return Mul(Log1p(d, Div(Add(abs_x, abs_x), Sub(kOne, abs_x))),
+             Xor(kHalf, sign));
+}
+
+template <class D, class V>
+HWY_INLINE V Cos(const D d, V x) {
+  using T = TFromD<D>;
+  impl::CosSinImpl<T> impl;
+
+  // Float Constants
+  const V kOneOverPi = Set(d, static_cast<T>(0.31830988618379067153));
+
+  // Integer Constants
+  const Rebind<int32_t, D> di32;
+  using VI32 = decltype(Zero(di32));
+  const VI32 kOne = Set(di32, 1);
+
+  const V y = Abs(x);  // cos(x) == cos(|x|)
+
+  // Compute the quadrant, q = int(|x| / pi) * 2 + 1
+  const VI32 q = Add(ShiftLeft<1>(impl.ToInt32(d, Mul(y, kOneOverPi))), kOne);
+
+  // Reduce range, apply sign, and approximate.
+  return impl.Poly(
+      d, Xor(impl.CosReduce(d, y, q), impl.CosSignFromQuadrant(d, q)));
+}
+
+template <class D, class V>
+HWY_INLINE V Exp(const D d, V x) {
+  using T = TFromD<D>;
+
+  const V kHalf = Set(d, static_cast<T>(+0.5));
+  const V kLowerBound =
+      Set(d, static_cast<T>((sizeof(T) == 4 ? -104.0 : -1000.0)));
+  const V kNegZero = Set(d, static_cast<T>(-0.0));
+  const V kOne = Set(d, static_cast<T>(+1.0));
+  const V kOneOverLog2 =
+      Set(d, static_cast<T>(+1.442695040888963407359924681));
+
+  impl::ExpImpl<T> impl;
+
+  // q = static_cast<int32_t>((x / log(2)) + ((x < 0) ? -0.5 : +0.5))
+  const auto q =
+      impl.ToInt32(d, MulAdd(x, kOneOverLog2, Or(kHalf, And(x, kNegZero))));
+
+  // Reduce, approximate, and then reconstruct.
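+  // (LoadExpShortRange applies the 2^q scale as two multiplies by roughly
+  // 2^(q/2), so the scale factor itself never leaves the finite exponent
+  // range.)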
+  const V y = impl.LoadExpShortRange(
+      d, Add(impl.ExpPoly(d, impl.ExpReduce(d, x, q)), kOne), q);
+  return IfThenElseZero(Ge(x, kLowerBound), y);
+}
+
+template <class D, class V>
+HWY_INLINE V Expm1(const D d, V x) {
+  using T = TFromD<D>;
+
+  const V kHalf = Set(d, static_cast<T>(+0.5));
+  const V kLowerBound =
+      Set(d, static_cast<T>((sizeof(T) == 4 ? -104.0 : -1000.0)));
+  const V kLn2Over2 = Set(d, static_cast<T>(+0.346573590279972654708616));
+  const V kNegOne = Set(d, static_cast<T>(-1.0));
+  const V kNegZero = Set(d, static_cast<T>(-0.0));
+  const V kOne = Set(d, static_cast<T>(+1.0));
+  const V kOneOverLog2 =
+      Set(d, static_cast<T>(+1.442695040888963407359924681));
+
+  impl::ExpImpl<T> impl;
+
+  // q = static_cast<int32_t>((x / log(2)) + ((x < 0) ? -0.5 : +0.5))
+  const auto q =
+      impl.ToInt32(d, MulAdd(x, kOneOverLog2, Or(kHalf, And(x, kNegZero))));
+
+  // Reduce, approximate, and then reconstruct.
+  const V y = impl.ExpPoly(d, impl.ExpReduce(d, x, q));
+  const V z = IfThenElse(Lt(Abs(x), kLn2Over2), y,
+                         Sub(impl.LoadExpShortRange(d, Add(y, kOne), q), kOne));
+  return IfThenElse(Lt(x, kLowerBound), kNegOne, z);
+}
+
+template <class D, class V>
+HWY_INLINE V Log(const D d, V x) {
+  return impl::Log(d, x);
+}
+
+template <class D, class V>
+HWY_INLINE V Log10(const D d, V x) {
+  using T = TFromD<D>;
+  return Mul(Log(d, x), Set(d, static_cast<T>(0.4342944819032518276511)));
+}
+
+template <class D, class V>
+HWY_INLINE V Log1p(const D d, V x) {
+  using T = TFromD<D>;
+  const V kOne = Set(d, static_cast<T>(+1.0));
+
+  const V y = Add(x, kOne);
+  const auto is_pole = Eq(y, kOne);
+  const auto divisor = Sub(IfThenZeroElse(is_pole, y), kOne);
+  const auto non_pole = Mul(impl::Log(d, y), Div(x, divisor));
+  return IfThenElse(is_pole, x, non_pole);
+}
+
+template <class D, class V>
+HWY_INLINE V Log2(const D d, V x) {
+  using T = TFromD<D>;
+  return Mul(Log(d, x), Set(d, static_cast<T>(1.44269504088896340735992)));
+}
+
+template <class D, class V>
+HWY_INLINE V Sin(const D d, V x) {
+  using T = TFromD<D>;
+  impl::CosSinImpl<T> impl;
+
+  // Float Constants
+  const V kOneOverPi = Set(d, static_cast<T>(0.31830988618379067153));
+  const V kHalf = Set(d, static_cast<T>(0.5));
+
+  // Integer Constants
+  const Rebind<int32_t, D> di32;
+  using VI32 = decltype(Zero(di32));
+
+  const V abs_x = Abs(x);
+  const V sign_x = Xor(abs_x, x);
+
+  // Compute the quadrant, q = int((|x| / pi) + 0.5)
+  const VI32 q = impl.ToInt32(d, MulAdd(abs_x, kOneOverPi, kHalf));
+
+  // Reduce range, apply sign, and approximate.
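+  // (Identity used: sin(x) = (-1)^q * sin(x - q * pi). SinSignFromQuadrant
+  // yields the (-1)^q sign bit, and XORing with sign_x restores the input
+  // sign because sin is odd.)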
+  return impl.Poly(d, Xor(impl.SinReduce(d, abs_x, q),
+                          Xor(impl.SinSignFromQuadrant(d, q), sign_x)));
+}
+
+template <class D, class V>
+HWY_INLINE V Sinh(const D d, V x) {
+  using T = TFromD<D>;
+  const V kHalf = Set(d, static_cast<T>(+0.5));
+  const V kOne = Set(d, static_cast<T>(+1.0));
+  const V kTwo = Set(d, static_cast<T>(+2.0));
+
+  const V sign = And(SignBit(d), x);  // Extract the sign bit
+  const V abs_x = Xor(x, sign);
+  const V y = Expm1(d, abs_x);
+  const V z = Mul(Div(Add(y, kTwo), Add(y, kOne)), Mul(y, kHalf));
+  return Xor(z, sign);  // Reapply the sign bit
+}
+
+template <class D, class V>
+HWY_INLINE V Tanh(const D d, V x) {
+  using T = TFromD<D>;
+  const V kLimit = Set(d, static_cast<T>(18.714973875));
+  const V kOne = Set(d, static_cast<T>(+1.0));
+  const V kTwo = Set(d, static_cast<T>(+2.0));
+
+  const V sign = And(SignBit(d), x);  // Extract the sign bit
+  const V abs_x = Xor(x, sign);
+  const V y = Expm1(d, Mul(abs_x, kTwo));
+  const V z = IfThenElse(Gt(abs_x, kLimit), kOne, Div(y, Add(y, kTwo)));
+  return Xor(z, sign);  // Reapply the sign bit
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#endif  // HIGHWAY_HWY_CONTRIB_MATH_MATH_INL_H_
diff --git a/hwy/contrib/math/math_test.cc b/hwy/contrib/math/math_test.cc
new file mode 100644
index 0000000..246a081
--- /dev/null
+++ b/hwy/contrib/math/math_test.cc
@@ -0,0 +1,227 @@
+// Copyright 2020 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef __STDC_FORMAT_MACROS
+#define __STDC_FORMAT_MACROS  // before inttypes.h
+#endif
+#include <inttypes.h>
+#include <stdio.h>
+
+#include <cfloat>  // FLT_MAX
+#include <cmath>
+
+// clang-format off
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "hwy/contrib/math/math_test.cc"
+#include "hwy/foreach_target.h"  // IWYU pragma: keep
+
+#include "hwy/contrib/math/math-inl.h"
+#include "hwy/tests/test_util-inl.h"
+// clang-format on
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+template <typename Out, typename In>
+inline Out BitCast(const In& in) {
+  static_assert(sizeof(Out) == sizeof(In), "");
+  Out out;
+  CopyBytes<sizeof(In)>(&in, &out);
+  return out;
+}
+
+template <class T, class D>
+HWY_NOINLINE void TestMath(const std::string name, T (*fx1)(T),
+                           Vec<D> (*fxN)(D, VecArg<Vec<D>>), D d, T min, T max,
+                           uint64_t max_error_ulp) {
+  using UintT = MakeUnsigned<T>;
+
+  const UintT min_bits = BitCast<UintT>(min);
+  const UintT max_bits = BitCast<UintT>(max);
+
+  // If min is negative and max is positive, the range needs to be broken into
+  // two pieces, [+0, max] and [-0, min], otherwise [min, max].
+  int range_count = 1;
+  UintT ranges[2][2] = {{min_bits, max_bits}, {0, 0}};
+  if ((min < 0.0) && (max > 0.0)) {
+    ranges[0][0] = BitCast<UintT>(static_cast<T>(+0.0));
+    ranges[0][1] = max_bits;
+    ranges[1][0] = BitCast<UintT>(static_cast<T>(-0.0));
+    ranges[1][1] = min_bits;
+    range_count = 2;
+  }
+
+  uint64_t max_ulp = 0;
+  // Emulation is slower, so cannot afford as many.
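+  // Stepping uniformly through bit representations, rather than values,
+  // samples the domain roughly logarithmically: every power-of-two interval
+  // contains the same number of bit patterns, so all magnitudes are probed.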
+  constexpr UintT kSamplesPerRange = static_cast<UintT>(AdjustedReps(4000));
+  for (int range_index = 0; range_index < range_count; ++range_index) {
+    const UintT start = ranges[range_index][0];
+    const UintT stop = ranges[range_index][1];
+    const UintT step = HWY_MAX(1, ((stop - start) / kSamplesPerRange));
+    for (UintT value_bits = start; value_bits <= stop; value_bits += step) {
+      // For reasons unknown, the HWY_MAX is necessary on RVV, otherwise
+      // value_bits can be less than start, and thus possibly NaN.
+      const T value = BitCast<T>(HWY_MIN(HWY_MAX(start, value_bits), stop));
+      const T actual = GetLane(fxN(d, Set(d, value)));
+      const T expected = fx1(value);
+
+      // Skip small inputs and outputs on armv7, it flushes subnormals to zero.
+#if HWY_TARGET == HWY_NEON && HWY_ARCH_ARM_V7
+      if ((std::abs(value) < 1e-37f) || (std::abs(expected) < 1e-37f)) {
+        continue;
+      }
+#endif
+
+      const auto ulp = hwy::detail::ComputeUlpDelta(actual, expected);
+      max_ulp = HWY_MAX(max_ulp, ulp);
+      if (ulp > max_error_ulp) {
+        fprintf(stderr,
+                "%s: %s(%f) expected %f actual %f ulp %" PRIu64 " max ulp %u\n",
+                hwy::TypeName(T(), Lanes(d)).c_str(), name.c_str(), value,
+                expected, actual, static_cast<uint64_t>(ulp),
+                static_cast<unsigned>(max_error_ulp));
+      }
+    }
+  }
+  fprintf(stderr, "%s: %s max_ulp %" PRIu64 "\n",
+          hwy::TypeName(T(), Lanes(d)).c_str(), name.c_str(), max_ulp);
+  HWY_ASSERT(max_ulp <= max_error_ulp);
+}
+
+#define DEFINE_MATH_TEST_FUNC(NAME)                 \
+  HWY_NOINLINE void TestAll##NAME() {               \
+    ForFloatTypes(ForPartialVectors<Test##NAME>()); \
+  }
+
+#undef DEFINE_MATH_TEST
+#define DEFINE_MATH_TEST(NAME, F32x1, F32xN, F32_MIN, F32_MAX, F32_ERROR,  \
+                         F64x1, F64xN, F64_MIN, F64_MAX, F64_ERROR)        \
+  struct Test##NAME {                                                      \
+    template <typename T, class D>                                         \
+    HWY_NOINLINE void operator()(T, D d) {                                 \
+      if (sizeof(T) == 4) {                                                \
+        TestMath<T, D>(HWY_STR(NAME), F32x1, F32xN, d, F32_MIN, F32_MAX,   \
+                       F32_ERROR);                                         \
+      } else {                                                             \
+        TestMath<T, D>(HWY_STR(NAME), F64x1, F64xN, d,                     \
+                       static_cast<T>(F64_MIN), static_cast<T>(F64_MAX),   \
+                       F64_ERROR);                                         \
+      }                                                                    \
+    }                                                                      \
+  };                                                                       \
+  DEFINE_MATH_TEST_FUNC(NAME)
+
+// Floating point values closest to but less than 1.0
+const float kNearOneF = BitCast<float>(0x3F7FFFFF);
+const double kNearOneD = BitCast<double>(0x3FEFFFFFFFFFFFFFULL);
+
+// The discrepancy is unacceptably large for MSYS2 (less accurate libm?), so
+// only increase the error tolerance there.
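+// (Error bounds are in ULP, units in the last place: an error of 1 ulp means
+// the result is at most one representable value away from the std:: reference
+// result; e.g. near 1.0f, one ulp is 2^-23 ~= 1.2e-7.)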
+constexpr uint64_t Cos64ULP() { +#if defined(__MINGW32__) + return 23; +#else + return 3; +#endif +} + +constexpr uint64_t ACosh32ULP() { +#if defined(__MINGW32__) + return 8; +#else + return 3; +#endif +} + +// clang-format off +DEFINE_MATH_TEST(Acos, + std::acos, CallAcos, -1.0f, +1.0f, 3, // NEON is 3 instead of 2 + std::acos, CallAcos, -1.0, +1.0, 2) +DEFINE_MATH_TEST(Acosh, + std::acosh, CallAcosh, +1.0f, +FLT_MAX, ACosh32ULP(), + std::acosh, CallAcosh, +1.0, +DBL_MAX, 3) +DEFINE_MATH_TEST(Asin, + std::asin, CallAsin, -1.0f, +1.0f, 4, // ARMv7 is 4 instead of 2 + std::asin, CallAsin, -1.0, +1.0, 2) +DEFINE_MATH_TEST(Asinh, + std::asinh, CallAsinh, -FLT_MAX, +FLT_MAX, 3, + std::asinh, CallAsinh, -DBL_MAX, +DBL_MAX, 3) +DEFINE_MATH_TEST(Atan, + std::atan, CallAtan, -FLT_MAX, +FLT_MAX, 3, + std::atan, CallAtan, -DBL_MAX, +DBL_MAX, 3) +DEFINE_MATH_TEST(Atanh, + std::atanh, CallAtanh, -kNearOneF, +kNearOneF, 4, // NEON is 4 instead of 3 + std::atanh, CallAtanh, -kNearOneD, +kNearOneD, 3) +DEFINE_MATH_TEST(Cos, + std::cos, CallCos, -39000.0f, +39000.0f, 3, + std::cos, CallCos, -39000.0, +39000.0, Cos64ULP()) +DEFINE_MATH_TEST(Exp, + std::exp, CallExp, -FLT_MAX, +104.0f, 1, + std::exp, CallExp, -DBL_MAX, +104.0, 1) +DEFINE_MATH_TEST(Expm1, + std::expm1, CallExpm1, -FLT_MAX, +104.0f, 4, + std::expm1, CallExpm1, -DBL_MAX, +104.0, 4) +DEFINE_MATH_TEST(Log, + std::log, CallLog, +FLT_MIN, +FLT_MAX, 1, + std::log, CallLog, +DBL_MIN, +DBL_MAX, 1) +DEFINE_MATH_TEST(Log10, + std::log10, CallLog10, +FLT_MIN, +FLT_MAX, 2, + std::log10, CallLog10, +DBL_MIN, +DBL_MAX, 2) +DEFINE_MATH_TEST(Log1p, + std::log1p, CallLog1p, +0.0f, +1e37f, 3, // NEON is 3 instead of 2 + std::log1p, CallLog1p, +0.0, +DBL_MAX, 2) +DEFINE_MATH_TEST(Log2, + std::log2, CallLog2, +FLT_MIN, +FLT_MAX, 2, + std::log2, CallLog2, +DBL_MIN, +DBL_MAX, 2) +DEFINE_MATH_TEST(Sin, + std::sin, CallSin, -39000.0f, +39000.0f, 3, + std::sin, CallSin, -39000.0, +39000.0, 4) // MSYS is 4 instead of 3 +DEFINE_MATH_TEST(Sinh, + std::sinh, CallSinh, -80.0f, +80.0f, 4, + std::sinh, CallSinh, -709.0, +709.0, 4) +DEFINE_MATH_TEST(Tanh, + std::tanh, CallTanh, -FLT_MAX, +FLT_MAX, 4, + std::tanh, CallTanh, -DBL_MAX, +DBL_MAX, 4) +// clang-format on + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE + +namespace hwy { +HWY_BEFORE_TEST(HwyMathTest); +HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllAcos); +HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllAcosh); +HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllAsin); +HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllAsinh); +HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllAtan); +HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllAtanh); +HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllCos); +HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllExp); +HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllExpm1); +HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllLog); +HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllLog10); +HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllLog1p); +HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllLog2); +HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllSin); +HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllSinh); +HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllTanh); +} // namespace hwy + +#endif diff --git a/hwy/contrib/sort/BUILD b/hwy/contrib/sort/BUILD new file mode 100644 index 0000000..3f56d6d --- /dev/null +++ b/hwy/contrib/sort/BUILD @@ -0,0 +1,190 @@ +package(default_visibility = ["//visibility:public"]) + +licenses(["notice"]) + +# Unused on Bazel builds, where this is not 
defined/known; Copybara replaces +# usages with an empty list. +COMPAT = [ + "//buildenv/target:non_prod", # includes mobile/vendor. +] + +# cc_library( +# name = "vxsort", +# srcs = [ +# "vxsort/isa_detection.cpp", +# "vxsort/isa_detection_msvc.cpp", +# "vxsort/isa_detection_sane.cpp", +# "vxsort/machine_traits.avx2.cpp", +# "vxsort/smallsort/avx2_load_mask_tables.cpp", +# "vxsort/smallsort/bitonic_sort.AVX2.double.generated.cpp", +# "vxsort/smallsort/bitonic_sort.AVX2.float.generated.cpp", +# "vxsort/smallsort/bitonic_sort.AVX2.int32_t.generated.cpp", +# "vxsort/smallsort/bitonic_sort.AVX2.int64_t.generated.cpp", +# "vxsort/smallsort/bitonic_sort.AVX2.uint32_t.generated.cpp", +# "vxsort/smallsort/bitonic_sort.AVX2.uint64_t.generated.cpp", +# "vxsort/smallsort/bitonic_sort.AVX512.double.generated.cpp", +# "vxsort/smallsort/bitonic_sort.AVX512.float.generated.cpp", +# "vxsort/smallsort/bitonic_sort.AVX512.int32_t.generated.cpp", +# "vxsort/smallsort/bitonic_sort.AVX512.int64_t.generated.cpp", +# "vxsort/smallsort/bitonic_sort.AVX512.uint32_t.generated.cpp", +# "vxsort/smallsort/bitonic_sort.AVX512.uint64_t.generated.cpp", +# "vxsort/vxsort_stats.cpp", +# ], +# hdrs = [ +# "vxsort/alignment.h", +# "vxsort/defs.h", +# "vxsort/isa_detection.h", +# "vxsort/machine_traits.avx2.h", +# "vxsort/machine_traits.avx512.h", +# "vxsort/machine_traits.h", +# "vxsort/packer.h", +# "vxsort/smallsort/bitonic_sort.AVX2.double.generated.h", +# "vxsort/smallsort/bitonic_sort.AVX2.float.generated.h", +# "vxsort/smallsort/bitonic_sort.AVX2.int32_t.generated.h", +# "vxsort/smallsort/bitonic_sort.AVX2.int64_t.generated.h", +# "vxsort/smallsort/bitonic_sort.AVX2.uint32_t.generated.h", +# "vxsort/smallsort/bitonic_sort.AVX2.uint64_t.generated.h", +# "vxsort/smallsort/bitonic_sort.AVX512.double.generated.h", +# "vxsort/smallsort/bitonic_sort.AVX512.float.generated.h", +# "vxsort/smallsort/bitonic_sort.AVX512.int32_t.generated.h", +# "vxsort/smallsort/bitonic_sort.AVX512.int64_t.generated.h", +# "vxsort/smallsort/bitonic_sort.AVX512.uint32_t.generated.h", +# "vxsort/smallsort/bitonic_sort.AVX512.uint64_t.generated.h", +# "vxsort/smallsort/bitonic_sort.h", +# "vxsort/vxsort.h", +# "vxsort/vxsort_stats.h", +# ], +# compatible_with = [], +# textual_hdrs = [ +# "vxsort/vxsort_targets_disable.h", +# "vxsort/vxsort_targets_enable_avx2.h", +# "vxsort/vxsort_targets_enable_avx512.h", +# ], +# ) + +cc_library( + name = "vqsort", + srcs = [ + # Split into separate files to reduce MSVC build time. + "vqsort.cc", + "vqsort_128a.cc", + "vqsort_128d.cc", + "vqsort_f32a.cc", + "vqsort_f32d.cc", + "vqsort_f64a.cc", + "vqsort_f64d.cc", + "vqsort_i16a.cc", + "vqsort_i16d.cc", + "vqsort_i32a.cc", + "vqsort_i32d.cc", + "vqsort_i64a.cc", + "vqsort_i64d.cc", + "vqsort_kv64a.cc", + "vqsort_kv64d.cc", + "vqsort_kv128a.cc", + "vqsort_kv128d.cc", + "vqsort_u16a.cc", + "vqsort_u16d.cc", + "vqsort_u32a.cc", + "vqsort_u32d.cc", + "vqsort_u64a.cc", + "vqsort_u64d.cc", + ], + hdrs = [ + "vqsort.h", # public interface + ], + compatible_with = [], + local_defines = ["hwy_contrib_EXPORTS"], + textual_hdrs = [ + "shared-inl.h", + "sorting_networks-inl.h", + "traits-inl.h", + "traits128-inl.h", + "vqsort-inl.h", + # Placeholder for internal instrumentation. Do not remove. + ], + deps = [ + # Only if VQSORT_SECURE_RNG is set. 
+ # "//third_party/absl/random", + "//:hwy", + # ":vxsort", # required if HAVE_VXSORT + ], +) + +# ----------------------------------------------------------------------------- +# Internal-only targets + +cc_library( + name = "helpers", + testonly = 1, + textual_hdrs = [ + "algo-inl.h", + "result-inl.h", + ], + deps = [ + ":vqsort", + "//:nanobenchmark", + # Required for HAVE_PDQSORT, but that is unused and this is + # unavailable to Bazel builds, hence commented out. + # "//third_party/boost/allowed", + # Avoid ips4o and thus TBB to work around hwloc build failure. + ], +) + +cc_binary( + name = "print_network", + testonly = 1, + srcs = ["print_network.cc"], + deps = [ + ":helpers", + ":vqsort", + "//:hwy", + ], +) + +cc_test( + name = "sort_test", + size = "medium", + srcs = ["sort_test.cc"], + # Do not enable fully_static_link (pthread crash on bazel) + local_defines = ["HWY_IS_TEST"], + # for test_suite. + tags = ["hwy_ops_test"], + deps = [ + ":helpers", + ":vqsort", + "@com_google_googletest//:gtest_main", + "//:hwy", + "//:hwy_test_util", + ], +) + +cc_binary( + name = "bench_sort", + testonly = 1, + srcs = ["bench_sort.cc"], + # Do not enable fully_static_link (pthread crash on bazel) + local_defines = ["HWY_IS_TEST"], + deps = [ + ":helpers", + ":vqsort", + "@com_google_googletest//:gtest_main", + "//:hwy", + "//:hwy_test_util", + ], +) + +cc_binary( + name = "bench_parallel", + testonly = 1, + srcs = ["bench_parallel.cc"], + # Do not enable fully_static_link (pthread crash on bazel) + local_defines = ["HWY_IS_TEST"], + deps = [ + ":helpers", + ":vqsort", + "@com_google_googletest//:gtest_main", + "//:hwy", + "//:hwy_test_util", + ], +) diff --git a/hwy/contrib/sort/README.md b/hwy/contrib/sort/README.md new file mode 100644 index 0000000..a005141 --- /dev/null +++ b/hwy/contrib/sort/README.md @@ -0,0 +1,87 @@ +# Vectorized and performance-portable Quicksort + +## Introduction + +As of 2022-06-07 this sorts large arrays of built-in types about ten times as +fast as `std::sort`. See also our +[blog post](https://opensource.googleblog.com/2022/06/Vectorized%20and%20performance%20portable%20Quicksort.html) +and [paper](https://arxiv.org/abs/2205.05982). + +## Instructions + +Here are instructions for reproducing our results on Linux and AWS (SVE, NEON). + +### Linux + +Please first ensure golang, and Clang (tested with 13.0.1) are installed via +your system's package manager. + +``` +go install github.com/bazelbuild/bazelisk@latest +git clone https://github.com/google/highway +cd highway +CC=clang CXX=clang++ ~/go/bin/bazelisk build -c opt hwy/contrib/sort:all +bazel-bin/hwy/contrib/sort/sort_test +bazel-bin/hwy/contrib/sort/bench_sort +``` + +### AWS Graviton3 + +Instance config: amazon linux 5.10 arm64, c7g.8xlarge (largest allowed config is +32 vCPU). Initial launch will fail. Wait a few minutes for an email saying the +config is verified, then re-launch. See IPv4 hostname in list of instances. + +`ssh -i /path/key.pem ec2-user@hostname` + +Note that the AWS CMake package is too old for llvm, so we build it first: +``` +wget https://cmake.org/files/v3.23/cmake-3.23.2.tar.gz +tar -xvzf cmake-3.23.2.tar.gz && cd cmake-3.23.2/ +./bootstrap -- -DCMAKE_USE_OPENSSL=OFF +make -j8 && sudo make install +cd .. +``` + +AWS clang is at version 11.1, which generates unnecessary `AND` instructions +which slow down the sort by 1.15x. We tested with clang trunk as of June 13 +(which reports Git hash 8f6512fea000c3a0d394864bb94e524bee375069). 
To build:

+```
+git clone --depth 1 https://github.com/llvm/llvm-project.git
+cd llvm-project
+mkdir -p build && cd build
+/usr/local/bin/cmake ../llvm -DLLVM_ENABLE_PROJECTS="clang" -DLLVM_ENABLE_RUNTIMES="libcxx;libcxxabi" -DCMAKE_BUILD_TYPE=Release
+make -j32 && sudo make install
+```
+
+```
+sudo yum install go
+go install github.com/bazelbuild/bazelisk@latest
+git clone https://github.com/google/highway
+cd highway
+CC=/usr/local/bin/clang CXX=/usr/local/bin/clang++ ~/go/bin/bazelisk build -c opt --copt=-march=armv8.2-a+sve hwy/contrib/sort:all
+bazel-bin/hwy/contrib/sort/sort_test
+bazel-bin/hwy/contrib/sort/bench_sort
+```
+
+The above command line enables SVE, which is currently only available on
+Graviton 3. You can also test NEON on the same processor, or other Arm CPUs,
+by changing the `-march=` option to `--copt=-march=armv8.2-a+crypto`. Note
+that such flags will be unnecessary once Clang supports `#pragma target` for
+NEON and SVE intrinsics, as it does for x86.
+
+## Results
+
+`bench_sort` outputs the instruction set (AVX3 refers to AVX-512), the sort
+algorithm (std for `std::sort`, vq for our vqsort), the type of keys being
+sorted (f32 is float), the distribution of keys (uniform32 for uniform random
+with range 0-2^32), the number of keys, then the throughput of sorted keys
+(i.e. number of key bytes output per second).
+
+Example excerpt from Xeon 6154 (Skylake-X) CPU clocked at 3 GHz:
+
+```
+[ RUN      ] BenchSortGroup/BenchSort.BenchAllSort/AVX3
+      AVX3:          std:     f32: uniform32: 1.00E+06   54 MB/s ( 1 threads)
+      AVX3:           vq:     f32: uniform32: 1.00E+06 1143 MB/s ( 1 threads)
+```
diff --git a/hwy/contrib/sort/algo-inl.h b/hwy/contrib/sort/algo-inl.h
new file mode 100644
index 0000000..4b01e2d
--- /dev/null
+++ b/hwy/contrib/sort/algo-inl.h
@@ -0,0 +1,512 @@
+// Copyright 2021 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Normal include guard for target-independent parts
+#ifndef HIGHWAY_HWY_CONTRIB_SORT_ALGO_INL_H_
+#define HIGHWAY_HWY_CONTRIB_SORT_ALGO_INL_H_
+
+#include <stdint.h>
+#include <string.h>  // memcpy
+
+#include <algorithm>
+#include <cmath>  // std::abs
+#include <vector>
+
+#include "hwy/base.h"
+#include "hwy/contrib/sort/vqsort.h"
+
+// Third-party algorithms
+#define HAVE_AVX2SORT 0
+#define HAVE_IPS4O 0
+// When enabling, consider changing max_threads (required for Table 1a)
+#define HAVE_PARALLEL_IPS4O (HAVE_IPS4O && 1)
+#define HAVE_PDQSORT 0
+#define HAVE_SORT512 0
+#define HAVE_VXSORT 0
+
+#if HAVE_AVX2SORT
+HWY_PUSH_ATTRIBUTES("avx2,avx")
+#include "avx2sort.h"  //NOLINT
+HWY_POP_ATTRIBUTES
+#endif
+#if HAVE_IPS4O || HAVE_PARALLEL_IPS4O
+#include "third_party/ips4o/include/ips4o.hpp"
+#include "third_party/ips4o/include/ips4o/thread_pool.hpp"
+#endif
+#if HAVE_PDQSORT
+#include "third_party/boost/allowed/sort/sort.hpp"
+#endif
+#if HAVE_SORT512
+#include "sort512.h"  //NOLINT
+#endif
+
+// vxsort is difficult to compile for multiple targets because it also uses
+// .cpp files, and we'd also have to #undef its include guards. Instead,
+// compile only for AVX2 or AVX3 depending on this macro.
+#define VXSORT_AVX3 1
+#if HAVE_VXSORT
+// inlined from vxsort_targets_enable_avx512 (must close before end of header)
+#ifdef __GNUC__
+#ifdef __clang__
+#if VXSORT_AVX3
+#pragma clang attribute push(__attribute__((target("avx512f,avx512dq"))), \
+                             apply_to = any(function))
+#else
+#pragma clang attribute push(__attribute__((target("avx2"))), \
+                             apply_to = any(function))
+#endif  // VXSORT_AVX3
+
+#else
+#pragma GCC push_options
+#if VXSORT_AVX3
+#pragma GCC target("avx512f,avx512dq")
+#else
+#pragma GCC target("avx2")
+#endif  // VXSORT_AVX3
+#endif
+#endif
+
+#if VXSORT_AVX3
+#include "vxsort/machine_traits.avx512.h"
+#else
+#include "vxsort/machine_traits.avx2.h"
+#endif  // VXSORT_AVX3
+#include "vxsort/vxsort.h"
+#ifdef __GNUC__
+#ifdef __clang__
+#pragma clang attribute pop
+#else
+#pragma GCC pop_options
+#endif
+#endif
+#endif  // HAVE_VXSORT
+
+namespace hwy {
+
+enum class Dist { kUniform8, kUniform16, kUniform32 };
+
+static inline std::vector<Dist> AllDist() {
+  return {/*Dist::kUniform8, Dist::kUniform16,*/ Dist::kUniform32};
+}
+
+static inline const char* DistName(Dist dist) {
+  switch (dist) {
+    case Dist::kUniform8:
+      return "uniform8";
+    case Dist::kUniform16:
+      return "uniform16";
+    case Dist::kUniform32:
+      return "uniform32";
+  }
+  return "unreachable";
+}
+
+template <typename T>
+class InputStats {
+ public:
+  void Notify(T value) {
+    min_ = std::min(min_, value);
+    max_ = std::max(max_, value);
+    // Converting to integer would truncate floats, multiplying to save digits
+    // risks overflow especially when casting, so instead take the sum of the
+    // bit representations as the checksum.
+    uint64_t bits = 0;
+    static_assert(sizeof(T) <= 8, "Expected a built-in type");
+    CopyBytes<sizeof(T)>(&value, &bits);  // not same size
+    sum_ += bits;
+    count_ += 1;
+  }
+
+  bool operator==(const InputStats& other) const {
+    if (count_ != other.count_) {
+      HWY_ABORT("count %d vs %d\n", static_cast<int>(count_),
+                static_cast<int>(other.count_));
+    }
+
+    if (min_ != other.min_ || max_ != other.max_) {
+      HWY_ABORT("minmax %f/%f vs %f/%f\n", static_cast<double>(min_),
+                static_cast<double>(max_), static_cast<double>(other.min_),
+                static_cast<double>(other.max_));
+    }
+
+    // Sum helps detect duplicated/lost values
+    if (sum_ != other.sum_) {
+      HWY_ABORT("Sum mismatch %g %g; min %g max %g\n",
+                static_cast<double>(sum_), static_cast<double>(other.sum_),
+                static_cast<double>(min_), static_cast<double>(max_));
+    }
+
+    return true;
+  }
+
+ private:
+  T min_ = hwy::HighestValue<T>();
+  T max_ = hwy::LowestValue<T>();
+  uint64_t sum_ = 0;
+  size_t count_ = 0;
+};
+
+enum class Algo {
+#if HAVE_AVX2SORT
+  kSEA,
+#endif
+#if HAVE_IPS4O
+  kIPS4O,
+#endif
+#if HAVE_PARALLEL_IPS4O
+  kParallelIPS4O,
+#endif
+#if HAVE_PDQSORT
+  kPDQ,
+#endif
+#if HAVE_SORT512
+  kSort512,
+#endif
+#if HAVE_VXSORT
+  kVXSort,
+#endif
+  kStd,
+  kVQSort,
+  kHeap,
+};
+
+static inline const char* AlgoName(Algo algo) {
+  switch (algo) {
+#if HAVE_AVX2SORT
+    case Algo::kSEA:
+      return "sea";
+#endif
+#if HAVE_IPS4O
+    case Algo::kIPS4O:
+      return "ips4o";
+#endif
+#if HAVE_PARALLEL_IPS4O
+    case Algo::kParallelIPS4O:
+      return "par_ips4o";
+#endif
+#if HAVE_PDQSORT
+    case Algo::kPDQ:
+      return "pdq";
+#endif
+#if HAVE_SORT512
+    case Algo::kSort512:
+      return "sort512";
+#endif
+#if HAVE_VXSORT
+    case Algo::kVXSort:
+      return "vxsort";
+#endif
+    case Algo::kStd:
+      return "std";
+    case Algo::kVQSort:
+      return "vq";
+    case Algo::kHeap:
+      return "heap";
+  }
+  return "unreachable";
+}
+
+}  // namespace hwy
+#endif  // HIGHWAY_HWY_CONTRIB_SORT_ALGO_INL_H_
+
+// Per-target
+#if
defined(HIGHWAY_HWY_CONTRIB_SORT_ALGO_TOGGLE) == \ + defined(HWY_TARGET_TOGGLE) +#ifdef HIGHWAY_HWY_CONTRIB_SORT_ALGO_TOGGLE +#undef HIGHWAY_HWY_CONTRIB_SORT_ALGO_TOGGLE +#else +#define HIGHWAY_HWY_CONTRIB_SORT_ALGO_TOGGLE +#endif + +#include "hwy/contrib/sort/traits-inl.h" +#include "hwy/contrib/sort/traits128-inl.h" +#include "hwy/contrib/sort/vqsort-inl.h" // HeapSort +#include "hwy/tests/test_util-inl.h" + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { + +class Xorshift128Plus { + static HWY_INLINE uint64_t SplitMix64(uint64_t z) { + z = (z ^ (z >> 30)) * 0xBF58476D1CE4E5B9ull; + z = (z ^ (z >> 27)) * 0x94D049BB133111EBull; + return z ^ (z >> 31); + } + + public: + // Generates two vectors of 64-bit seeds via SplitMix64 and stores into + // `seeds`. Generating these afresh in each ChoosePivot is too expensive. + template + static void GenerateSeeds(DU64 du64, TFromD* HWY_RESTRICT seeds) { + seeds[0] = SplitMix64(0x9E3779B97F4A7C15ull); + for (size_t i = 1; i < 2 * Lanes(du64); ++i) { + seeds[i] = SplitMix64(seeds[i - 1]); + } + } + + // Need to pass in the state because vector cannot be class members. + template + static VU64 RandomBits(VU64& state0, VU64& state1) { + VU64 s1 = state0; + VU64 s0 = state1; + const VU64 bits = Add(s1, s0); + state0 = s0; + s1 = Xor(s1, ShiftLeft<23>(s1)); + state1 = Xor(s1, Xor(s0, Xor(ShiftRight<18>(s1), ShiftRight<5>(s0)))); + return bits; + } +}; + +template +Vec RandomValues(D d, VU64& s0, VU64& s1, const VU64 mask) { + const VU64 bits = Xorshift128Plus::RandomBits(s0, s1); + return BitCast(d, And(bits, mask)); +} + +// It is important to avoid denormals, which are flushed to zero by SIMD but not +// scalar sorts, and NaN, which may be ordered differently in scalar vs. SIMD. +template +Vec RandomValues(DF df, VU64& s0, VU64& s1, const VU64 mask) { + using TF = TFromD; + const RebindToUnsigned du; + using VU = Vec; + + const VU64 bits64 = And(Xorshift128Plus::RandomBits(s0, s1), mask); + +#if HWY_TARGET == HWY_SCALAR // Cannot repartition u64 to smaller types + using TU = MakeUnsigned; + const VU bits = Set(du, static_cast(GetLane(bits64) & LimitsMax())); +#else + const VU bits = BitCast(du, bits64); +#endif + // Avoid NaN/denormal by only generating values in [1, 2), i.e. random + // mantissas with the exponent taken from the representation of 1.0. + const VU k1 = BitCast(du, Set(df, TF{1.0})); + const VU mantissa_mask = Set(du, MantissaMask()); + const VU representation = OrAnd(k1, bits, mantissa_mask); + return BitCast(df, representation); +} + +template +Vec MaskForDist(DU64 du64, const Dist dist, size_t sizeof_t) { + switch (sizeof_t) { + case 2: + return Set(du64, (dist == Dist::kUniform8) ? 0x00FF00FF00FF00FFull + : 0xFFFFFFFFFFFFFFFFull); + case 4: + return Set(du64, (dist == Dist::kUniform8) ? 0x000000FF000000FFull + : (dist == Dist::kUniform16) ? 0x0000FFFF0000FFFFull + : 0xFFFFFFFFFFFFFFFFull); + case 8: + return Set(du64, (dist == Dist::kUniform8) ? 0x00000000000000FFull + : (dist == Dist::kUniform16) ? 
0x000000000000FFFFull + : 0x00000000FFFFFFFFull); + default: + HWY_ABORT("Logic error"); + return Zero(du64); + } +} + +template +InputStats GenerateInput(const Dist dist, T* v, size_t num) { + SortTag du64; + using VU64 = Vec; + const size_t N64 = Lanes(du64); + auto seeds = hwy::AllocateAligned(2 * N64); + Xorshift128Plus::GenerateSeeds(du64, seeds.get()); + VU64 s0 = Load(du64, seeds.get()); + VU64 s1 = Load(du64, seeds.get() + N64); + +#if HWY_TARGET == HWY_SCALAR + const Sisd d; +#else + const Repartition d; +#endif + using V = Vec; + const size_t N = Lanes(d); + const VU64 mask = MaskForDist(du64, dist, sizeof(T)); + auto buf = hwy::AllocateAligned(N); + + size_t i = 0; + for (; i + N <= num; i += N) { + const V values = RandomValues(d, s0, s1, mask); + StoreU(values, d, v + i); + } + if (i < num) { + const V values = RandomValues(d, s0, s1, mask); + StoreU(values, d, buf.get()); + memcpy(v + i, buf.get(), (num - i) * sizeof(T)); + } + + InputStats input_stats; + for (size_t i = 0; i < num; ++i) { + input_stats.Notify(v[i]); + } + return input_stats; +} + +struct ThreadLocal { + Sorter sorter; +}; + +struct SharedState { +#if HAVE_PARALLEL_IPS4O + const unsigned max_threads = hwy::LimitsMax(); // 16 for Table 1a + ips4o::StdThreadPool pool{static_cast( + HWY_MIN(max_threads, std::thread::hardware_concurrency() / 2))}; +#endif + std::vector tls{1}; +}; + +// Bridge from keys (passed to Run) to lanes as expected by HeapSort. For +// non-128-bit keys they are the same: +template +void CallHeapSort(KeyType* HWY_RESTRICT keys, const size_t num_keys) { + using detail::TraitsLane; + using detail::SharedTraits; + if (Order().IsAscending()) { + const SharedTraits>> st; + return detail::HeapSort(st, keys, num_keys); + } else { + const SharedTraits>> st; + return detail::HeapSort(st, keys, num_keys); + } +} + +#if VQSORT_ENABLED +template +void CallHeapSort(hwy::uint128_t* HWY_RESTRICT keys, const size_t num_keys) { + using detail::SharedTraits; + using detail::Traits128; + uint64_t* lanes = reinterpret_cast(keys); + const size_t num_lanes = num_keys * 2; + if (Order().IsAscending()) { + const SharedTraits> st; + return detail::HeapSort(st, lanes, num_lanes); + } else { + const SharedTraits> st; + return detail::HeapSort(st, lanes, num_lanes); + } +} + +template +void CallHeapSort(K64V64* HWY_RESTRICT keys, const size_t num_keys) { + using detail::SharedTraits; + using detail::Traits128; + uint64_t* lanes = reinterpret_cast(keys); + const size_t num_lanes = num_keys * 2; + if (Order().IsAscending()) { + const SharedTraits> st; + return detail::HeapSort(st, lanes, num_lanes); + } else { + const SharedTraits> st; + return detail::HeapSort(st, lanes, num_lanes); + } +} +#endif // VQSORT_ENABLED + +template +void Run(Algo algo, KeyType* HWY_RESTRICT inout, size_t num, + SharedState& shared, size_t thread) { + const std::less less; + const std::greater greater; + + switch (algo) { +#if HAVE_AVX2SORT + case Algo::kSEA: + return avx2::quicksort(inout, static_cast(num)); +#endif + +#if HAVE_IPS4O + case Algo::kIPS4O: + if (Order().IsAscending()) { + return ips4o::sort(inout, inout + num, less); + } else { + return ips4o::sort(inout, inout + num, greater); + } +#endif + +#if HAVE_PARALLEL_IPS4O + case Algo::kParallelIPS4O: + if (Order().IsAscending()) { + return ips4o::parallel::sort(inout, inout + num, less, shared.pool); + } else { + return ips4o::parallel::sort(inout, inout + num, greater, shared.pool); + } +#endif + +#if HAVE_SORT512 + case Algo::kSort512: + HWY_ABORT("not supported"); + // return 
Sort512::Sort(inout, num); +#endif + +#if HAVE_PDQSORT + case Algo::kPDQ: + if (Order().IsAscending()) { + return boost::sort::pdqsort_branchless(inout, inout + num, less); + } else { + return boost::sort::pdqsort_branchless(inout, inout + num, greater); + } +#endif + +#if HAVE_VXSORT + case Algo::kVXSort: { +#if (VXSORT_AVX3 && HWY_TARGET != HWY_AVX3) || \ + (!VXSORT_AVX3 && HWY_TARGET != HWY_AVX2) + fprintf(stderr, "Do not call for target %s\n", + hwy::TargetName(HWY_TARGET)); + return; +#else +#if VXSORT_AVX3 + vxsort::vxsort vx; +#else + vxsort::vxsort vx; +#endif + if (Order().IsAscending()) { + return vx.sort(inout, inout + num - 1); + } else { + fprintf(stderr, "Skipping VX - does not support descending order\n"); + return; + } +#endif // enabled for this target + } +#endif // HAVE_VXSORT + + case Algo::kStd: + if (Order().IsAscending()) { + return std::sort(inout, inout + num, less); + } else { + return std::sort(inout, inout + num, greater); + } + + case Algo::kVQSort: + return shared.tls[thread].sorter(inout, num, Order()); + + case Algo::kHeap: + return CallHeapSort(inout, num); + + default: + HWY_ABORT("Not implemented"); + } +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +#endif // HIGHWAY_HWY_CONTRIB_SORT_ALGO_TOGGLE diff --git a/hwy/contrib/sort/bench_parallel.cc b/hwy/contrib/sort/bench_parallel.cc new file mode 100644 index 0000000..1c8c928 --- /dev/null +++ b/hwy/contrib/sort/bench_parallel.cc @@ -0,0 +1,238 @@ +// Copyright 2021 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Concurrent, independent sorts for generating more memory traffic and testing +// scalability. + +#include +#include + +#include //NOLINT +#include +#include +#include //NOLINT +#include //NOLINT +#include +#include + +// clang-format off +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "hwy/contrib/sort/bench_parallel.cc" //NOLINT +#include "hwy/foreach_target.h" // IWYU pragma: keep + +// After foreach_target +#include "hwy/contrib/sort/algo-inl.h" +#include "hwy/contrib/sort/result-inl.h" +#include "hwy/aligned_allocator.h" +// Last +#include "hwy/tests/test_util-inl.h" +// clang-format on + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { +namespace { + +class ThreadPool { + public: + // Starts the given number of worker threads and blocks until they are ready. + explicit ThreadPool( + const size_t num_threads = std::thread::hardware_concurrency()) + : num_threads_(num_threads) { + HWY_ASSERT(num_threads_ > 0); + threads_.reserve(num_threads_); + for (size_t i = 0; i < num_threads_; ++i) { + threads_.emplace_back(ThreadFunc, this, i); + } + + WorkersReadyBarrier(); + } + + ThreadPool(const ThreadPool&) = delete; + ThreadPool& operator&(const ThreadPool&) = delete; + + // Waits for all threads to exit. 
+ ~ThreadPool() { + StartWorkers(kWorkerExit); + + for (std::thread& thread : threads_) { + thread.join(); + } + } + + size_t NumThreads() const { return threads_.size(); } + + template + void RunOnThreads(size_t max_threads, const Func& func) { + task_ = &CallClosure; + data_ = &func; + StartWorkers(max_threads); + WorkersReadyBarrier(); + } + + private: + // After construction and between calls to Run, workers are "ready", i.e. + // waiting on worker_start_cv_. They are "started" by sending a "command" + // and notifying all worker_start_cv_ waiters. (That is why all workers + // must be ready/waiting - otherwise, the notification will not reach all of + // them and the main thread waits in vain for them to report readiness.) + using WorkerCommand = uint64_t; + + static constexpr WorkerCommand kWorkerWait = ~1ULL; + static constexpr WorkerCommand kWorkerExit = ~2ULL; + + // Calls a closure (lambda with captures). + template + static void CallClosure(const void* f, size_t thread) { + (*reinterpret_cast(f))(thread); + } + + void WorkersReadyBarrier() { + std::unique_lock lock(mutex_); + // Typically only a single iteration. + while (workers_ready_ != threads_.size()) { + workers_ready_cv_.wait(lock); + } + workers_ready_ = 0; + + // Safely handle spurious worker wakeups. + worker_start_command_ = kWorkerWait; + } + + // Precondition: all workers are ready. + void StartWorkers(const WorkerCommand worker_command) { + std::unique_lock lock(mutex_); + worker_start_command_ = worker_command; + // Workers will need this lock, so release it before they wake up. + lock.unlock(); + worker_start_cv_.notify_all(); + } + + static void ThreadFunc(ThreadPool* self, size_t thread) { + // Until kWorkerExit command received: + for (;;) { + std::unique_lock lock(self->mutex_); + // Notify main thread that this thread is ready. + if (++self->workers_ready_ == self->num_threads_) { + self->workers_ready_cv_.notify_one(); + } + RESUME_WAIT: + // Wait for a command. + self->worker_start_cv_.wait(lock); + const WorkerCommand command = self->worker_start_command_; + switch (command) { + case kWorkerWait: // spurious wakeup: + goto RESUME_WAIT; // lock still held, avoid incrementing ready. + case kWorkerExit: + return; // exits thread + default: + break; + } + + lock.unlock(); + // Command is the maximum number of threads that should run the task. + HWY_ASSERT(command < self->NumThreads()); + if (thread < command) { + self->task_(self->data_, thread); + } + } + } + + const size_t num_threads_; + + // Unmodified after ctor, but cannot be const because we call thread::join(). + std::vector threads_; + + std::mutex mutex_; // guards both cv and their variables. + std::condition_variable workers_ready_cv_; + size_t workers_ready_ = 0; + std::condition_variable worker_start_cv_; + WorkerCommand worker_start_command_; + + // Written by main thread, read by workers (after mutex lock/unlock). 
+ std::function task_; // points to CallClosure + const void* data_; // points to caller's Func +}; + +template +void RunWithoutVerify(Traits st, const Dist dist, const size_t num_keys, + const Algo algo, SharedState& shared, size_t thread) { + using LaneType = typename Traits::LaneType; + using KeyType = typename Traits::KeyType; + using Order = typename Traits::Order; + const size_t num_lanes = num_keys * st.LanesPerKey(); + auto aligned = hwy::AllocateAligned(num_lanes); + + (void)GenerateInput(dist, aligned.get(), num_lanes); + + const Timestamp t0; + Run(algo, reinterpret_cast(aligned.get()), num_keys, shared, + thread); + HWY_ASSERT(aligned[0] < aligned[num_lanes - 1]); +} + +void BenchParallel() { + // Not interested in benchmark results for other targets on x86 + if (HWY_ARCH_X86 && (HWY_TARGET != HWY_AVX2 && HWY_TARGET != HWY_AVX3)) { + return; + } + + ThreadPool pool; + const size_t NT = pool.NumThreads(); + + detail::SharedTraits>> st; + using KeyType = typename decltype(st)::KeyType; + const size_t num_keys = size_t{100} * 1000 * 1000; + +#if HAVE_IPS4O + const Algo algo = Algo::kIPS4O; +#else + const Algo algo = Algo::kVQSort; +#endif + const Dist dist = Dist::kUniform32; + + SharedState shared; + shared.tls.resize(NT); + + std::vector results; + for (size_t nt = 1; nt < NT; nt += HWY_MAX(1, NT / 16)) { + Timestamp t0; + // Default capture because MSVC wants algo/dist but clang does not. + pool.RunOnThreads(nt, [=, &shared](size_t thread) { + RunWithoutVerify(st, dist, num_keys, algo, shared, thread); + }); + const double sec = SecondsSince(t0); + results.emplace_back(algo, dist, num_keys, nt, sec, sizeof(KeyType), + st.KeyString()); + results.back().Print(); + } +} + +} // namespace +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE + +namespace hwy { +namespace { +HWY_BEFORE_TEST(BenchParallel); +HWY_EXPORT_AND_TEST_P(BenchParallel, BenchParallel); +} // namespace +} // namespace hwy + +#endif // HWY_ONCE diff --git a/hwy/contrib/sort/bench_sort.cc b/hwy/contrib/sort/bench_sort.cc new file mode 100644 index 0000000..a668fde --- /dev/null +++ b/hwy/contrib/sort/bench_sort.cc @@ -0,0 +1,310 @@ +// Copyright 2021 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include + +#include + +// clang-format off +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "hwy/contrib/sort/bench_sort.cc" +#include "hwy/foreach_target.h" // IWYU pragma: keep + +// After foreach_target +#include "hwy/contrib/sort/algo-inl.h" +#include "hwy/contrib/sort/result-inl.h" +#include "hwy/contrib/sort/sorting_networks-inl.h" // SharedTraits +#include "hwy/contrib/sort/traits-inl.h" +#include "hwy/contrib/sort/traits128-inl.h" +#include "hwy/tests/test_util-inl.h" +// clang-format on + +// Mode for larger sorts because M1 is able to access more than the per-core +// share of L2, so 1M elements might still be in cache. 
+#define SORT_100M 0 + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +// Defined within HWY_ONCE, used by BenchAllSort. +extern int64_t first_sort_target; + +namespace HWY_NAMESPACE { +namespace { +using detail::TraitsLane; +using detail::OrderAscending; +using detail::OrderDescending; +using detail::SharedTraits; + +#if VQSORT_ENABLED || HWY_IDE +using detail::OrderAscending128; +using detail::OrderAscendingKV128; +using detail::Traits128; + +template +HWY_NOINLINE void BenchPartition() { + using LaneType = typename Traits::LaneType; + using KeyType = typename Traits::KeyType; + const SortTag d; + detail::SharedTraits st; + const Dist dist = Dist::kUniform8; + double sum = 0.0; + + detail::Generator rng(&sum, 123); // for ChoosePivot + + const size_t max_log2 = AdjustedLog2Reps(20); + for (size_t log2 = max_log2; log2 < max_log2 + 1; ++log2) { + const size_t num_lanes = 1ull << log2; + const size_t num_keys = num_lanes / st.LanesPerKey(); + auto aligned = hwy::AllocateAligned(num_lanes); + auto buf = hwy::AllocateAligned( + HWY_MAX(hwy::SortConstants::PartitionBufNum(Lanes(d)), + hwy::SortConstants::PivotBufNum(sizeof(LaneType), Lanes(d)))); + + std::vector seconds; + const size_t num_reps = (1ull << (14 - log2 / 2)) * 30; + for (size_t rep = 0; rep < num_reps; ++rep) { + (void)GenerateInput(dist, aligned.get(), num_lanes); + + // The pivot value can influence performance. Do exactly what vqsort will + // do so that the performance (influenced by prefetching and branch + // prediction) is likely to predict the actual performance inside vqsort. + detail::DrawSamples(d, st, aligned.get(), num_lanes, buf.get(), rng); + detail::SortSamples(d, st, buf.get()); + auto pivot = detail::ChoosePivotByRank(d, st, buf.get()); + + const Timestamp t0; + detail::Partition(d, st, aligned.get(), num_lanes - 1, pivot, buf.get()); + seconds.push_back(SecondsSince(t0)); + // 'Use' the result to prevent optimizing out the partition. 
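+      // (sum is also checked via HWY_ASSERT at function end, so it stays
+      // live and the compiler cannot discard the Partition call.)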
+ sum += static_cast(aligned.get()[num_lanes / 2]); + } + + Result(Algo::kVQSort, dist, num_keys, 1, SummarizeMeasurements(seconds), + sizeof(KeyType), st.KeyString()) + .Print(); + } + HWY_ASSERT(sum != 999999); // Prevent optimizing out +} + +HWY_NOINLINE void BenchAllPartition() { + // Not interested in benchmark results for these targets + if (HWY_TARGET == HWY_SSSE3) { + return; + } + + BenchPartition>>(); + BenchPartition>>(); + BenchPartition>>(); + BenchPartition>(); + // BenchPartition>(); + BenchPartition>(); +} + +template +HWY_NOINLINE void BenchBase(std::vector& results) { + // Not interested in benchmark results for these targets + if (HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4) { + return; + } + + using LaneType = typename Traits::LaneType; + using KeyType = typename Traits::KeyType; + const SortTag d; + detail::SharedTraits st; + const Dist dist = Dist::kUniform32; + + const size_t N = Lanes(d); + const size_t num_lanes = SortConstants::BaseCaseNum(N); + const size_t num_keys = num_lanes / st.LanesPerKey(); + auto keys = hwy::AllocateAligned(num_lanes); + auto buf = hwy::AllocateAligned(num_lanes + N); + + std::vector seconds; + double sum = 0; // prevents elision + constexpr size_t kMul = AdjustedReps(600); // ensures long enough to measure + + for (size_t rep = 0; rep < 30; ++rep) { + InputStats input_stats = + GenerateInput(dist, keys.get(), num_lanes); + + const Timestamp t0; + for (size_t i = 0; i < kMul; ++i) { + detail::BaseCase(d, st, keys.get(), keys.get() + num_lanes, num_lanes, + buf.get()); + sum += static_cast(keys[0]); + } + seconds.push_back(SecondsSince(t0)); + // printf("%f\n", seconds.back()); + + HWY_ASSERT(VerifySort(st, input_stats, keys.get(), num_lanes, "BenchBase")); + } + HWY_ASSERT(sum < 1E99); + results.emplace_back(Algo::kVQSort, dist, num_keys * kMul, 1, + SummarizeMeasurements(seconds), sizeof(KeyType), + st.KeyString()); +} + +HWY_NOINLINE void BenchAllBase() { + // Not interested in benchmark results for these targets + if (HWY_TARGET == HWY_SSSE3) { + return; + } + + std::vector results; + BenchBase>>(results); + BenchBase>>(results); + BenchBase>(results); + for (const Result& r : results) { + r.Print(); + } +} + +#else +void BenchAllPartition() {} +void BenchAllBase() {} +#endif // VQSORT_ENABLED + +std::vector AlgoForBench() { + return { +#if HAVE_AVX2SORT + Algo::kSEA, +#endif +#if HAVE_PARALLEL_IPS4O + Algo::kParallelIPS4O, +#elif HAVE_IPS4O + Algo::kIPS4O, +#endif +#if HAVE_PDQSORT + Algo::kPDQ, +#endif +#if HAVE_SORT512 + Algo::kSort512, +#endif +// Only include if we're compiling for the target it supports. +#if HAVE_VXSORT && ((VXSORT_AVX3 && HWY_TARGET == HWY_AVX3) || \ + (!VXSORT_AVX3 && HWY_TARGET == HWY_AVX2)) + Algo::kVXSort, +#endif + +#if !HAVE_PARALLEL_IPS4O +#if !SORT_100M + // These are 10-20x slower, but that's OK for the default size when we + // are not testing the parallel nor 100M modes. + Algo::kStd, Algo::kHeap, +#endif + + Algo::kVQSort, // only ~4x slower, but not required for Table 1a +#endif + }; +} + +template +HWY_NOINLINE void BenchSort(size_t num_keys) { + if (first_sort_target == 0) first_sort_target = HWY_TARGET; + + SharedState shared; + detail::SharedTraits st; + using Order = typename Traits::Order; + using LaneType = typename Traits::LaneType; + using KeyType = typename Traits::KeyType; + const size_t num_lanes = num_keys * st.LanesPerKey(); + auto aligned = hwy::AllocateAligned(num_lanes); + + const size_t reps = num_keys > 1000 * 1000 ? 
10 : 30; + + for (Algo algo : AlgoForBench()) { + // Other algorithms don't depend on the vector instructions, so only run + // them for the first target. +#if !HAVE_VXSORT + if (algo != Algo::kVQSort && HWY_TARGET != first_sort_target) { + continue; + } +#endif + + for (Dist dist : AllDist()) { + std::vector seconds; + for (size_t rep = 0; rep < reps; ++rep) { + InputStats input_stats = + GenerateInput(dist, aligned.get(), num_lanes); + + const Timestamp t0; + Run(algo, reinterpret_cast(aligned.get()), num_keys, + shared, /*thread=*/0); + seconds.push_back(SecondsSince(t0)); + // printf("%f\n", seconds.back()); + + HWY_ASSERT( + VerifySort(st, input_stats, aligned.get(), num_lanes, "BenchSort")); + } + Result(algo, dist, num_keys, 1, SummarizeMeasurements(seconds), + sizeof(KeyType), st.KeyString()) + .Print(); + } // dist + } // algo +} + +HWY_NOINLINE void BenchAllSort() { + // Not interested in benchmark results for these targets + if (HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4) { + return; + } + + constexpr size_t K = 1000; + constexpr size_t M = K * K; + (void)K; + (void)M; + for (size_t num_keys : { +#if HAVE_PARALLEL_IPS4O || SORT_100M + 100 * M, +#else + 1 * M, +#endif + }) { + BenchSort>>(num_keys); + // BenchSort>>(num_keys); + // BenchSort>>(num_keys); + BenchSort>>(num_keys); + BenchSort>>(num_keys); + // BenchSort>>(num_keys); + // BenchSort>>(num_keys); + // BenchSort>>(num_keys); + +#if !HAVE_VXSORT && VQSORT_ENABLED + BenchSort>(num_keys); + BenchSort>(num_keys); +#endif + } +} + +} // namespace +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE + +namespace hwy { +int64_t first_sort_target = 0; // none run yet +namespace { +HWY_BEFORE_TEST(BenchSort); +HWY_EXPORT_AND_TEST_P(BenchSort, BenchAllPartition); +HWY_EXPORT_AND_TEST_P(BenchSort, BenchAllBase); +HWY_EXPORT_AND_TEST_P(BenchSort, BenchAllSort); +} // namespace +} // namespace hwy + +#endif // HWY_ONCE diff --git a/hwy/contrib/sort/print_network.cc b/hwy/contrib/sort/print_network.cc new file mode 100644 index 0000000..59cfebc --- /dev/null +++ b/hwy/contrib/sort/print_network.cc @@ -0,0 +1,191 @@ +// Copyright 2021 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include + +#include "hwy/base.h" + +// Based on A.7 in "Entwurf und Implementierung vektorisierter +// Sortieralgorithmen" and code by Mark Blacher. 
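+// Each PrintMergeNetwork16xN function below emits the C++ body of a bitonic
+// merge of 16 vectors holding N keys each; the printed statements are pasted
+// into the corresponding Merge functions of sorting_networks-inl.h rather
+// than generated at build time.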
+void PrintMergeNetwork16x2() { + for (int i = 8; i < 16; ++i) { + printf("v%x = st.SwapAdjacent(d, v%x);\n", i, i); + } + for (int i = 0; i < 8; ++i) { + printf("st.Sort2(d, v%x, v%x);\n", i, 15 - i); + } + for (int i = 0; i < 4; ++i) { + printf("v%x = st.SwapAdjacent(d, v%x);\n", i + 4, i + 4); + printf("v%x = st.SwapAdjacent(d, v%x);\n", i + 12, i + 12); + } + for (int i = 0; i < 4; ++i) { + printf("st.Sort2(d, v%x, v%x);\n", i, 7 - i); + printf("st.Sort2(d, v%x, v%x);\n", i + 8, 15 - i); + } + for (int i = 0; i < 16; i += 4) { + printf("v%x = st.SwapAdjacent(d, v%x);\n", i + 2, i + 2); + printf("v%x = st.SwapAdjacent(d, v%x);\n", i + 3, i + 3); + } + for (int i = 0; i < 16; i += 4) { + printf("st.Sort2(d, v%x, v%x);\n", i, i + 3); + printf("st.Sort2(d, v%x, v%x);\n", i + 1, i + 2); + } + for (int i = 0; i < 16; i += 2) { + printf("v%x = st.SwapAdjacent(d, v%x);\n", i + 1, i + 1); + } + for (int i = 0; i < 16; i += 2) { + printf("st.Sort2(d, v%x, v%x);\n", i, i + 1); + } + for (int i = 0; i < 16; ++i) { + printf("v%x = st.SortPairsDistance1(d, v%x);\n", i, i); + } + printf("\n"); +} + +void PrintMergeNetwork16x4() { + printf("\n"); + + for (int i = 8; i < 16; ++i) { + printf("v%x = st.Reverse4(d, v%x);\n", i, i); + } + for (int i = 0; i < 8; ++i) { + printf("st.Sort2(d, v%x, v%x);\n", i, 15 - i); + } + for (int i = 0; i < 4; ++i) { + printf("v%x = st.Reverse4(d, v%x);\n", i + 4, i + 4); + printf("v%x = st.Reverse4(d, v%x);\n", i + 12, i + 12); + } + for (int i = 0; i < 4; ++i) { + printf("st.Sort2(d, v%x, v%x);\n", i, 7 - i); + printf("st.Sort2(d, v%x, v%x);\n", i + 8, 15 - i); + } + for (int i = 0; i < 16; i += 4) { + printf("v%x = st.Reverse4(d, v%x);\n", i + 2, i + 2); + printf("v%x = st.Reverse4(d, v%x);\n", i + 3, i + 3); + } + for (int i = 0; i < 16; i += 4) { + printf("st.Sort2(d, v%x, v%x);\n", i, i + 3); + printf("st.Sort2(d, v%x, v%x);\n", i + 1, i + 2); + } + for (int i = 0; i < 16; i += 2) { + printf("v%x = st.Reverse4(d, v%x);\n", i + 1, i + 1); + } + for (int i = 0; i < 16; i += 2) { + printf("st.Sort2(d, v%x, v%x);\n", i, i + 1); + } + for (int i = 0; i < 16; ++i) { + printf("v%x = st.SortPairsReverse4(d, v%x);\n", i, i); + } + for (int i = 0; i < 16; ++i) { + printf("v%x = st.SortPairsDistance1(d, v%x);\n", i, i); + } +} + +void PrintMergeNetwork16x8() { + printf("\n"); + + for (int i = 8; i < 16; ++i) { + printf("v%x = st.ReverseKeys8(d, v%x);\n", i, i); + } + for (int i = 0; i < 8; ++i) { + printf("st.Sort2(d, v%x, v%x);\n", i, 15 - i); + } + for (int i = 0; i < 4; ++i) { + printf("v%x = st.ReverseKeys8(d, v%x);\n", i + 4, i + 4); + printf("v%x = st.ReverseKeys8(d, v%x);\n", i + 12, i + 12); + } + for (int i = 0; i < 4; ++i) { + printf("st.Sort2(d, v%x, v%x);\n", i, 7 - i); + printf("st.Sort2(d, v%x, v%x);\n", i + 8, 15 - i); + } + for (int i = 0; i < 16; i += 4) { + printf("v%x = st.ReverseKeys8(d, v%x);\n", i + 2, i + 2); + printf("v%x = st.ReverseKeys8(d, v%x);\n", i + 3, i + 3); + } + for (int i = 0; i < 16; i += 4) { + printf("st.Sort2(d, v%x, v%x);\n", i, i + 3); + printf("st.Sort2(d, v%x, v%x);\n", i + 1, i + 2); + } + for (int i = 0; i < 16; i += 2) { + printf("v%x = st.ReverseKeys8(d, v%x);\n", i + 1, i + 1); + } + for (int i = 0; i < 16; i += 2) { + printf("st.Sort2(d, v%x, v%x);\n", i, i + 1); + } + for (int i = 0; i < 16; ++i) { + printf("v%x = st.SortPairsReverse8(d, v%x);\n", i, i); + } + for (int i = 0; i < 16; ++i) { + printf("v%x = st.SortPairsDistance2(d, v%x);\n", i, i); + } + for (int i = 0; i < 16; ++i) { + printf("v%x = st.SortPairsDistance1(d, 
v%x);\n", i, i); + } +} + +void PrintMergeNetwork16x16() { + printf("\n"); + + for (int i = 8; i < 16; ++i) { + printf("v%x = st.ReverseKeys16(d, v%x);\n", i, i); + } + for (int i = 0; i < 8; ++i) { + printf("st.Sort2(d, v%x, v%x);\n", i, 15 - i); + } + for (int i = 0; i < 4; ++i) { + printf("v%x = st.ReverseKeys16(d, v%x);\n", i + 4, i + 4); + printf("v%x = st.ReverseKeys16(d, v%x);\n", i + 12, i + 12); + } + for (int i = 0; i < 4; ++i) { + printf("st.Sort2(d, v%x, v%x);\n", i, 7 - i); + printf("st.Sort2(d, v%x, v%x);\n", i + 8, 15 - i); + } + for (int i = 0; i < 16; i += 4) { + printf("v%x = st.ReverseKeys16(d, v%x);\n", i + 2, i + 2); + printf("v%x = st.ReverseKeys16(d, v%x);\n", i + 3, i + 3); + } + for (int i = 0; i < 16; i += 4) { + printf("st.Sort2(d, v%x, v%x);\n", i, i + 3); + printf("st.Sort2(d, v%x, v%x);\n", i + 1, i + 2); + } + for (int i = 0; i < 16; i += 2) { + printf("v%x = st.ReverseKeys16(d, v%x);\n", i + 1, i + 1); + } + for (int i = 0; i < 16; i += 2) { + printf("st.Sort2(d, v%x, v%x);\n", i, i + 1); + } + for (int i = 0; i < 16; ++i) { + printf("v%x = st.SortPairsReverse16(d, v%x);\n", i, i); + } + for (int i = 0; i < 16; ++i) { + printf("v%x = st.SortPairsDistance4(d, v%x);\n", i, i); + } + for (int i = 0; i < 16; ++i) { + printf("v%x = st.SortPairsDistance2(d, v%x);\n", i, i); + } + for (int i = 0; i < 16; ++i) { + printf("v%x = st.SortPairsDistance1(d, v%x);\n", i, i); + } +} + +int main(int argc, char** argv) { + PrintMergeNetwork16x2(); + PrintMergeNetwork16x4(); + PrintMergeNetwork16x8(); + PrintMergeNetwork16x16(); + return 0; +} diff --git a/hwy/contrib/sort/result-inl.h b/hwy/contrib/sort/result-inl.h new file mode 100644 index 0000000..f3d842d --- /dev/null +++ b/hwy/contrib/sort/result-inl.h @@ -0,0 +1,139 @@ +// Copyright 2021 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "hwy/contrib/sort/algo-inl.h" + +// Normal include guard for non-SIMD parts +#ifndef HIGHWAY_HWY_CONTRIB_SORT_RESULT_INL_H_ +#define HIGHWAY_HWY_CONTRIB_SORT_RESULT_INL_H_ + +#include + +#include // std::sort +#include + +#include "hwy/base.h" +#include "hwy/nanobenchmark.h" + +namespace hwy { + +struct Timestamp { + Timestamp() { t = platform::Now(); } + double t; +}; + +static inline double SecondsSince(const Timestamp& t0) { + const Timestamp t1; + return t1.t - t0.t; +} + +// Returns trimmed mean (we don't want to run an out-of-L3-cache sort often +// enough for the mode to be reliable). 
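+// Concretely: sorts the samples and averages the second quartile, e.g. for
+// eight measurements the mean of seconds[2] and seconds[3].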
+static inline double SummarizeMeasurements(std::vector<double>& seconds) {
+  std::sort(seconds.begin(), seconds.end());
+  double sum = 0;
+  int count = 0;
+  const size_t num = seconds.size();
+  for (size_t i = num / 4; i < num / 2; ++i) {
+    sum += seconds[i];
+    count += 1;
+  }
+  return sum / count;
+}
+
+}  // namespace hwy
+#endif  // HIGHWAY_HWY_CONTRIB_SORT_RESULT_INL_H_
+
+// Per-target
+#if defined(HIGHWAY_HWY_CONTRIB_SORT_RESULT_TOGGLE) == \
+    defined(HWY_TARGET_TOGGLE)
+#ifdef HIGHWAY_HWY_CONTRIB_SORT_RESULT_TOGGLE
+#undef HIGHWAY_HWY_CONTRIB_SORT_RESULT_TOGGLE
+#else
+#define HIGHWAY_HWY_CONTRIB_SORT_RESULT_TOGGLE
+#endif
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+struct Result {
+  Result() {}
+  Result(const Algo algo, Dist dist, size_t num_keys, size_t num_threads,
+         double sec, size_t sizeof_key, const std::string& key_name)
+      : target(HWY_TARGET),
+        algo(algo),
+        dist(dist),
+        num_keys(num_keys),
+        num_threads(num_threads),
+        sec(sec),
+        sizeof_key(sizeof_key),
+        key_name(key_name) {}
+
+  void Print() const {
+    const double bytes = static_cast<double>(num_keys) *
+                         static_cast<double>(num_threads) *
+                         static_cast<double>(sizeof_key);
+    printf("%10s: %12s: %7s: %9s: %.2E %4.0f MB/s (%2zu threads)\n",
+           hwy::TargetName(target), AlgoName(algo), key_name.c_str(),
+           DistName(dist), static_cast<double>(num_keys), bytes * 1E-6 / sec,
+           num_threads);
+  }
+
+  int64_t target;
+  Algo algo;
+  Dist dist;
+  size_t num_keys = 0;
+  size_t num_threads = 0;
+  double sec = 0.0;
+  size_t sizeof_key = 0;
+  std::string key_name;
+};
+
+template <class Traits, typename LaneType>
+bool VerifySort(Traits st, const InputStats<LaneType>& input_stats,
+                const LaneType* out, size_t num_lanes, const char* caller) {
+  constexpr size_t N1 = st.LanesPerKey();
+  HWY_ASSERT(num_lanes >= N1);
+
+  InputStats<LaneType> output_stats;
+  // Ensure it matches the sort order
+  for (size_t i = 0; i < num_lanes - N1; i += N1) {
+    output_stats.Notify(out[i]);
+    if (N1 == 2) output_stats.Notify(out[i + 1]);
+    // Reverse order instead of checking !Compare1 so we accept equal keys.
+    if (st.Compare1(out + i + N1, out + i)) {
+      printf("%s: i=%d of %d lanes: N1=%d %5.0f %5.0f vs. %5.0f %5.0f\n\n",
+             caller, static_cast<int>(i), static_cast<int>(num_lanes),
+             static_cast<int>(N1), static_cast<double>(out[i + 1]),
+             static_cast<double>(out[i + 0]),
+             static_cast<double>(out[i + N1 + 1]),
+             static_cast<double>(out[i + N1]));
+      HWY_ABORT("%d-bit sort is incorrect\n",
+                static_cast<int>(sizeof(LaneType) * 8 * N1));
+    }
+  }
+  output_stats.Notify(out[num_lanes - N1]);
+  if (N1 == 2) output_stats.Notify(out[num_lanes - N1 + 1]);
+
+  return input_stats == output_stats;
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#endif  // HIGHWAY_HWY_CONTRIB_SORT_RESULT_TOGGLE
diff --git a/hwy/contrib/sort/shared-inl.h b/hwy/contrib/sort/shared-inl.h
new file mode 100644
index 0000000..ea604ed
--- /dev/null
+++ b/hwy/contrib/sort/shared-inl.h
@@ -0,0 +1,133 @@
+// Copyright 2021 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Definitions shared between vqsort-inl and sorting_networks-inl.
+
+// Normal include guard for target-independent parts
+#ifndef HIGHWAY_HWY_CONTRIB_SORT_SHARED_INL_H_
+#define HIGHWAY_HWY_CONTRIB_SORT_SHARED_INL_H_
+
+#include "hwy/base.h"
+
+namespace hwy {
+
+// Internal constants - these are to avoid magic numbers/literals and cannot be
+// changed without also changing the associated code.
+struct SortConstants {
+// SortingNetwork reshapes its input into a matrix. This is the maximum number
+// of *keys* per vector.
+#if HWY_COMPILER_MSVC || HWY_IS_DEBUG_BUILD
+  static constexpr size_t kMaxCols = 8;  // avoid build timeout/stack overflow
+#else
+  static constexpr size_t kMaxCols = 16;  // enough for u32 in 512-bit vector
+#endif
+
+  // 16 rows is a compromise between using the 32 AVX-512/SVE/RVV registers,
+  // fitting within 16 AVX2 registers with only a few spills, keeping BaseCase
+  // code size reasonable (7 KiB for AVX-512 and 16 cols), and minimizing the
+  // extra logN factor for larger networks (for which only loose upper bounds
+  // on size are known).
+  static constexpr size_t kMaxRowsLog2 = 4;
+  static constexpr size_t kMaxRows = size_t{1} << kMaxRowsLog2;
+
+  static constexpr HWY_INLINE size_t BaseCaseNum(size_t N) {
+    return kMaxRows * HWY_MIN(N, kMaxCols);
+  }
+
+  // Unrolling is important (pipelining and amortizing branch mispredictions);
+  // 2x is sufficient to reach full memory bandwidth on SKX in Partition, but
+  // somewhat slower for sorting than 4x.
+  //
+  // To change, must also update left + 3 * N etc. in the loop.
+  static constexpr size_t kPartitionUnroll = 4;
+
+  static constexpr HWY_INLINE size_t PartitionBufNum(size_t N) {
+    // The main loop reads kPartitionUnroll vectors, and first loads from
+    // both left and right beforehand, so it requires min = 2 *
+    // kPartitionUnroll vectors. To handle smaller amounts (only guaranteed
+    // >= BaseCaseNum), we partition the right side into a buffer. We need
+    // another vector at the end so CompressStore does not overwrite anything.
+    return (2 * kPartitionUnroll + 1) * N;
+  }
+
+  // Chunk := group of keys loaded for sampling a pivot. Matches the typical
+  // cache line size of 64 bytes to get maximum benefit per L2 miss. If vectors
+  // are larger, use entire vectors to ensure we do not overrun the array.
+  static constexpr HWY_INLINE size_t LanesPerChunk(size_t sizeof_t, size_t N) {
+    return HWY_MAX(64 / sizeof_t, N);
+  }
+
+  static constexpr HWY_INLINE size_t PivotBufNum(size_t sizeof_t, size_t N) {
+    // 3 chunks of medians, 1 chunk of median medians plus two padding vectors.
+    return (3 + 1) * LanesPerChunk(sizeof_t, N) + 2 * N;
+  }
+
+  template <typename T>
+  static constexpr HWY_INLINE size_t BufNum(size_t N) {
+    // One extra for padding plus another for full-vector loads.
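+    // i.e. room for whichever consumer needs the most lanes: BaseCase (plus
+    // 2 * N slack), Partition, or pivot sampling: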
+    return HWY_MAX(BaseCaseNum(N) + 2 * N,
+                   HWY_MAX(PartitionBufNum(N), PivotBufNum(sizeof(T), N)));
+  }
+
+  template <typename T>
+  static constexpr HWY_INLINE size_t BufBytes(size_t vector_size) {
+    return sizeof(T) * BufNum<T>(vector_size / sizeof(T));
+  }
+};
+
+}  // namespace hwy
+
+#endif  // HIGHWAY_HWY_CONTRIB_SORT_SHARED_INL_H_
+
+// Per-target
+#if defined(HIGHWAY_HWY_CONTRIB_SORT_SHARED_TOGGLE) == \
+    defined(HWY_TARGET_TOGGLE)
+#ifdef HIGHWAY_HWY_CONTRIB_SORT_SHARED_TOGGLE
+#undef HIGHWAY_HWY_CONTRIB_SORT_SHARED_TOGGLE
+#else
+#define HIGHWAY_HWY_CONTRIB_SORT_SHARED_TOGGLE
+#endif
+
+#include "hwy/highway.h"
+
+// vqsort isn't available on HWY_SCALAR, and builds time out on MSVC opt and
+// Arm v7 debug.
+#undef VQSORT_ENABLED
+#if (HWY_TARGET == HWY_SCALAR) ||                 \
+    (HWY_COMPILER_MSVC && !HWY_IS_DEBUG_BUILD) || \
+    (HWY_ARCH_ARM_V7 && HWY_IS_DEBUG_BUILD)
+#define VQSORT_ENABLED 0
+#else
+#define VQSORT_ENABLED 1
+#endif
+
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+// Default tag / vector width selector.
+#if HWY_TARGET == HWY_RVV
+// Use LMUL = 1/2; for SEW=64 this ends up emulated via vsetvl.
+template <typename T>
+using SortTag = ScalableTag<T, -1>;
+#else
+template <typename T>
+using SortTag = ScalableTag<T>;
+#endif
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace hwy
+
+#endif  // HIGHWAY_HWY_CONTRIB_SORT_SHARED_TOGGLE
diff --git a/hwy/contrib/sort/sort_test.cc b/hwy/contrib/sort/sort_test.cc
new file mode 100644
index 0000000..2d1f1d5
--- /dev/null
+++ b/hwy/contrib/sort/sort_test.cc
@@ -0,0 +1,626 @@
+// Copyright 2021 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+ +#ifndef __STDC_FORMAT_MACROS +#define __STDC_FORMAT_MACROS // before inttypes.h +#endif +#include +#include +#include +#include // memcpy + +#include +#include + +// clang-format off +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "hwy/contrib/sort/sort_test.cc" +#include "hwy/foreach_target.h" // IWYU pragma: keep + +#include "hwy/contrib/sort/vqsort.h" +// After foreach_target +#include "hwy/contrib/sort/algo-inl.h" +#include "hwy/contrib/sort/traits128-inl.h" +#include "hwy/contrib/sort/result-inl.h" +#include "hwy/contrib/sort/vqsort-inl.h" // BaseCase +#include "hwy/tests/test_util-inl.h" +// clang-format on + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { +namespace { + +using detail::OrderAscending; +using detail::OrderDescending; +using detail::SharedTraits; +using detail::TraitsLane; +#if VQSORT_ENABLED || HWY_IDE +using detail::OrderAscending128; +using detail::OrderAscendingKV128; +using detail::OrderAscendingKV64; +using detail::OrderDescending128; +using detail::OrderDescendingKV128; +using detail::OrderDescendingKV64; +using detail::Traits128; + +template +static HWY_NOINLINE void TestMedian3() { + using LaneType = typename Traits::LaneType; + using D = CappedTag; + SharedTraits st; + const D d; + using V = Vec; + for (uint32_t bits = 0; bits < 8; ++bits) { + const V v0 = Set(d, LaneType{(bits & (1u << 0)) ? 1u : 0u}); + const V v1 = Set(d, LaneType{(bits & (1u << 1)) ? 1u : 0u}); + const V v2 = Set(d, LaneType{(bits & (1u << 2)) ? 1u : 0u}); + const LaneType m = GetLane(detail::MedianOf3(st, v0, v1, v2)); + // If at least half(rounded up) of bits are 1, so is the median. + const size_t count = PopCount(bits); + HWY_ASSERT_EQ((count >= 2) ? static_cast(1) : 0, m); + } +} + +HWY_NOINLINE void TestAllMedian() { + TestMedian3 > >(); +} + +template +static HWY_NOINLINE void TestBaseCaseAscDesc() { + using LaneType = typename Traits::LaneType; + SharedTraits st; + const SortTag d; + const size_t N = Lanes(d); + const size_t base_case_num = SortConstants::BaseCaseNum(N); + const size_t N1 = st.LanesPerKey(); + + constexpr int kDebug = 0; + auto aligned_lanes = hwy::AllocateAligned(N + base_case_num + N); + auto buf = hwy::AllocateAligned(base_case_num + 2 * N); + + std::vector lengths; + lengths.push_back(HWY_MAX(1, N1)); + lengths.push_back(3 * N1); + lengths.push_back(base_case_num / 2); + lengths.push_back(base_case_num / 2 + N1); + lengths.push_back(base_case_num - N1); + lengths.push_back(base_case_num); + + std::vector misalignments; + misalignments.push_back(0); + misalignments.push_back(1); + if (N >= 6) misalignments.push_back(N / 2 - 1); + misalignments.push_back(N / 2); + misalignments.push_back(N / 2 + 1); + misalignments.push_back(HWY_MIN(2 * N / 3 + 3, size_t{N - 1})); + + for (bool asc : {false, true}) { + for (size_t len : lengths) { + for (size_t misalign : misalignments) { + LaneType* HWY_RESTRICT lanes = aligned_lanes.get() + misalign; + if (kDebug) { + printf("============%s asc %d N1 %d len %d misalign %d\n", + st.KeyString().c_str(), asc, static_cast(N1), + static_cast(len), static_cast(misalign)); + } + + for (size_t i = 0; i < misalign; ++i) { + aligned_lanes[i] = hwy::LowestValue(); + } + InputStats input_stats; + for (size_t i = 0; i < len; ++i) { + lanes[i] = asc ? 
static_cast(LaneType(i) + 1) + : static_cast(LaneType(len) - LaneType(i)); + input_stats.Notify(lanes[i]); + if (kDebug >= 2) { + printf("%3zu: %f\n", i, static_cast(lanes[i])); + } + } + for (size_t i = len; i < base_case_num + N; ++i) { + lanes[i] = hwy::LowestValue(); + } + + detail::BaseCase(d, st, lanes, lanes + len, len, buf.get()); + + if (kDebug >= 2) { + printf("out>>>>>>\n"); + for (size_t i = 0; i < len; ++i) { + printf("%3zu: %f\n", i, static_cast(lanes[i])); + } + } + + HWY_ASSERT(VerifySort(st, input_stats, lanes, len, "BaseAscDesc")); + for (size_t i = 0; i < misalign; ++i) { + if (aligned_lanes[i] != hwy::LowestValue()) + HWY_ABORT("Overrun misalign at %d\n", static_cast(i)); + } + for (size_t i = len; i < base_case_num + N; ++i) { + if (lanes[i] != hwy::LowestValue()) + HWY_ABORT("Overrun right at %d\n", static_cast(i)); + } + } // misalign + } // len + } // asc +} + +template +static HWY_NOINLINE void TestBaseCase01() { + using LaneType = typename Traits::LaneType; + SharedTraits st; + const SortTag d; + const size_t N = Lanes(d); + const size_t base_case_num = SortConstants::BaseCaseNum(N); + const size_t N1 = st.LanesPerKey(); + + constexpr int kDebug = 0; + auto lanes = hwy::AllocateAligned(base_case_num + N); + auto buf = hwy::AllocateAligned(base_case_num + 2 * N); + + std::vector lengths; + lengths.push_back(HWY_MAX(1, N1)); + lengths.push_back(3 * N1); + lengths.push_back(base_case_num / 2); + lengths.push_back(base_case_num / 2 + N1); + lengths.push_back(base_case_num - N1); + lengths.push_back(base_case_num); + + for (size_t len : lengths) { + if (kDebug) { + printf("============%s 01 N1 %d len %d\n", st.KeyString().c_str(), + static_cast(N1), static_cast(len)); + } + const uint64_t kMaxBits = AdjustedLog2Reps(HWY_MIN(len, size_t{14})); + for (uint64_t bits = 0; bits < ((1ull << kMaxBits) - 1); ++bits) { + InputStats input_stats; + for (size_t i = 0; i < len; ++i) { + lanes[i] = (i < 64 && (bits & (1ull << i))) ? 1 : 0; + input_stats.Notify(lanes[i]); + if (kDebug >= 2) { + printf("%3zu: %f\n", i, static_cast(lanes[i])); + } + } + for (size_t i = len; i < base_case_num + N; ++i) { + lanes[i] = hwy::LowestValue(); + } + + detail::BaseCase(d, st, lanes.get(), lanes.get() + len, len, buf.get()); + + if (kDebug >= 2) { + printf("out>>>>>>\n"); + for (size_t i = 0; i < len; ++i) { + printf("%3zu: %f\n", i, static_cast(lanes[i])); + } + } + + HWY_ASSERT(VerifySort(st, input_stats, lanes.get(), len, "Base01")); + for (size_t i = len; i < base_case_num + N; ++i) { + if (lanes[i] != hwy::LowestValue()) + HWY_ABORT("Overrun right at %d\n", static_cast(i)); + } + } // bits + } // len +} + +template +static HWY_NOINLINE void TestBaseCase() { + TestBaseCaseAscDesc(); + TestBaseCase01(); +} + +HWY_NOINLINE void TestAllBaseCase() { + // Workaround for stack overflow on MSVC debug. 
+#if defined(_MSC_VER) + return; +#endif + TestBaseCase > >(); + TestBaseCase > >(); + TestBaseCase >(); + TestBaseCase >(); +} + +template +static HWY_NOINLINE void VerifyPartition( + Traits st, typename Traits::LaneType* HWY_RESTRICT lanes, size_t left, + size_t border, size_t right, const size_t N1, + const typename Traits::LaneType* pivot) { + /* for (size_t i = left; i < right; ++i) { + if (i == border) printf("--\n"); + printf("%4zu: %3d\n", i, lanes[i]); + }*/ + + HWY_ASSERT(left % N1 == 0); + HWY_ASSERT(border % N1 == 0); + HWY_ASSERT(right % N1 == 0); + const bool asc = typename Traits::Order().IsAscending(); + for (size_t i = left; i < border; i += N1) { + if (st.Compare1(pivot, lanes + i)) { + HWY_ABORT( + "%s: asc %d left[%d] piv %.0f %.0f compares before %.0f %.0f " + "border %d", + st.KeyString().c_str(), asc, static_cast(i), + static_cast(pivot[1]), static_cast(pivot[0]), + static_cast(lanes[i + 1]), static_cast(lanes[i + 0]), + static_cast(border)); + } + } + for (size_t i = border; i < right; i += N1) { + if (!st.Compare1(pivot, lanes + i)) { + HWY_ABORT( + "%s: asc %d right[%d] piv %.0f %.0f compares after %.0f %.0f " + "border %d", + st.KeyString().c_str(), asc, static_cast(i), + static_cast(pivot[1]), static_cast(pivot[0]), + static_cast(lanes[i + 1]), static_cast(lanes[i]), + static_cast(border)); + } + } +} + +template +static HWY_NOINLINE void TestPartition() { + using LaneType = typename Traits::LaneType; + const SortTag d; + SharedTraits st; + const bool asc = typename Traits::Order().IsAscending(); + const size_t N = Lanes(d); + constexpr int kDebug = 0; + const size_t base_case_num = SortConstants::BaseCaseNum(N); + // left + len + align + const size_t total = 32 + (base_case_num + 4 * HWY_MAX(N, 4)) + 2 * N; + auto aligned_lanes = hwy::AllocateAligned(total); + auto buf = hwy::AllocateAligned(SortConstants::PartitionBufNum(N)); + + const size_t N1 = st.LanesPerKey(); + for (bool in_asc : {false, true}) { + for (int left_i : {0, 1, 4, 6, 7, 8, 12, 15, 22, 28, 30, 31}) { + const size_t left = static_cast(left_i) & ~(N1 - 1); + for (size_t ofs : {N, N + 1, N + 3, 2 * N, 2 * N + 2, 2 * N + 3, + 3 * N - 1, 4 * N - 3, 4 * N - 2}) { + const size_t len = (base_case_num + ofs) & ~(N1 - 1); + for (LaneType pivot1 : + {LaneType(0), LaneType(len / 3), LaneType(len / 2), + LaneType(2 * len / 3), LaneType(len)}) { + const LaneType pivot2[2] = {pivot1, 0}; + const auto pivot = st.SetKey(d, pivot2); + for (size_t misalign = 0; misalign < N; + misalign += st.LanesPerKey()) { + LaneType* HWY_RESTRICT lanes = aligned_lanes.get() + misalign; + const size_t right = left + len; + if (kDebug) { + printf( + "=========%s asc %d left %d len %d right %d piv %.0f %.0f\n", + st.KeyString().c_str(), asc, static_cast(left), + static_cast(len), static_cast(right), + static_cast(pivot2[1]), + static_cast(pivot2[0])); + } + + for (size_t i = 0; i < misalign; ++i) { + aligned_lanes[i] = hwy::LowestValue(); + } + for (size_t i = 0; i < left; ++i) { + lanes[i] = hwy::LowestValue(); + } + std::unordered_map counts; + for (size_t i = left; i < right; ++i) { + lanes[i] = static_cast( + in_asc ? 
LaneType(i + 1) - static_cast(left) + : static_cast(right) - LaneType(i)); + ++counts[lanes[i]]; + if (kDebug >= 2) { + printf("%3zu: %f\n", i, static_cast(lanes[i])); + } + } + for (size_t i = right; i < total - misalign; ++i) { + lanes[i] = hwy::LowestValue(); + } + + size_t border = + left + detail::Partition(d, st, lanes + left, right - left, + pivot, buf.get()); + + if (kDebug >= 2) { + printf("out>>>>>>\n"); + for (size_t i = left; i < right; ++i) { + printf("%3zu: %f\n", i, static_cast(lanes[i])); + } + for (size_t i = right; i < total - misalign; ++i) { + printf("%3zu: sentinel %f\n", i, static_cast(lanes[i])); + } + } + for (size_t i = left; i < right; ++i) { + --counts[lanes[i]]; + } + for (auto kv : counts) { + if (kv.second != 0) { + PrintValue(kv.first); + HWY_ABORT("Incorrect count %d\n", kv.second); + } + } + VerifyPartition(st, lanes, left, border, right, N1, pivot2); + for (size_t i = 0; i < misalign; ++i) { + if (aligned_lanes[i] != hwy::LowestValue()) + HWY_ABORT("Overrun misalign at %d\n", static_cast(i)); + } + for (size_t i = 0; i < left; ++i) { + if (lanes[i] != hwy::LowestValue()) + HWY_ABORT("Overrun left at %d\n", static_cast(i)); + } + for (size_t i = right; i < total - misalign; ++i) { + if (lanes[i] != hwy::LowestValue()) + HWY_ABORT("Overrun right at %d\n", static_cast(i)); + } + } // misalign + } // pivot + } // len + } // left + } // asc +} + +HWY_NOINLINE void TestAllPartition() { + TestPartition > >(); + TestPartition >(); + +#if !HWY_IS_DEBUG_BUILD + TestPartition > >(); + TestPartition > >(); + TestPartition > >(); +#if HWY_HAVE_FLOAT64 + TestPartition > >(); +#endif + TestPartition >(); +#endif +} + +// (used for sample selection for choosing a pivot) +template +static HWY_NOINLINE void TestRandomGenerator() { + static_assert(!hwy::IsSigned(), ""); + SortTag du; + const size_t N = Lanes(du); + + detail::Generator rng(&N, N); + + const size_t lanes_per_block = HWY_MAX(64 / sizeof(TU), N); // power of two + + for (uint32_t num_blocks = 2; num_blocks < 100000; + num_blocks = 3 * num_blocks / 2) { + // Generate some numbers and ensure all are in range + uint64_t sum = 0; + constexpr size_t kReps = 10000; + for (size_t rep = 0; rep < kReps; ++rep) { + const uint32_t bits = rng() & 0xFFFFFFFF; + const size_t index = detail::RandomChunkIndex(num_blocks, bits); + HWY_ASSERT(((index + 1) * lanes_per_block) <= + num_blocks * lanes_per_block); + + sum += index; + } + + // Also ensure the mean is near the middle of the range + const double expected = (num_blocks - 1) / 2.0; + const double actual = static_cast(sum) / kReps; + HWY_ASSERT(0.9 * expected <= actual && actual <= 1.1 * expected); + } +} + +HWY_NOINLINE void TestAllGenerator() { + TestRandomGenerator(); + TestRandomGenerator(); +} + +#else +static void TestAllMedian() {} +static void TestAllBaseCase() {} +static void TestAllPartition() {} +static void TestAllGenerator() {} +#endif // VQSORT_ENABLED + +// Remembers input, and compares results to that of a reference algorithm. 
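+// Usage: construct it before sorting to snapshot the input; Verify(output)
+// then sorts the snapshot with PDQ (if available, otherwise std::sort) and
+// compares element-wise, printing the first mismatch.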
+template +class CompareResults { + using LaneType = typename Traits::LaneType; + using KeyType = typename Traits::KeyType; + + public: + CompareResults(const LaneType* in, size_t num_lanes) { + copy_.resize(num_lanes); + memcpy(copy_.data(), in, num_lanes * sizeof(LaneType)); + } + + bool Verify(const LaneType* output) { +#if HAVE_PDQSORT + const Algo reference = Algo::kPDQ; +#else + const Algo reference = Algo::kStd; +#endif + SharedState shared; + using Order = typename Traits::Order; + const Traits st; + const size_t num_keys = copy_.size() / st.LanesPerKey(); + Run(reference, reinterpret_cast(copy_.data()), num_keys, + shared, /*thread=*/0); +#if VQSORT_PRINT >= 3 + fprintf(stderr, "\nExpected:\n"); + for (size_t i = 0; i < copy_.size(); ++i) { + PrintValue(copy_[i]); + } + fprintf(stderr, "\n"); +#endif + for (size_t i = 0; i < copy_.size(); ++i) { + if (copy_[i] != output[i]) { + if (sizeof(KeyType) == 16) { + fprintf(stderr, + "%s Asc %d mismatch at %d of %d: %" PRIu64 " %" PRIu64 "\n", + st.KeyString().c_str(), Order().IsAscending(), + static_cast(i), static_cast(copy_.size()), + static_cast(copy_[i]), + static_cast(output[i])); + } else { + fprintf(stderr, "Type %s Asc %d mismatch at %d of %d: ", + st.KeyString().c_str(), Order().IsAscending(), + static_cast(i), static_cast(copy_.size())); + PrintValue(copy_[i]); + PrintValue(output[i]); + fprintf(stderr, "\n"); + } + return false; + } + } + return true; + } + + private: + std::vector copy_; +}; + +std::vector AlgoForTest() { + return { +#if HAVE_AVX2SORT + Algo::kSEA, +#endif +#if HAVE_IPS4O + Algo::kIPS4O, +#endif +#if HAVE_PDQSORT + Algo::kPDQ, +#endif +#if HAVE_SORT512 + Algo::kSort512, +#endif + Algo::kHeap, Algo::kVQSort, + }; +} + +template +void TestSort(size_t num_lanes) { +// Workaround for stack overflow on clang-cl (/F 8388608 does not help). +#if defined(_MSC_VER) + return; +#endif + using Order = typename Traits::Order; + using LaneType = typename Traits::LaneType; + using KeyType = typename Traits::KeyType; + SharedState shared; + SharedTraits st; + + // Round up to a whole number of keys. 
+ num_lanes += (st.Is128() && (num_lanes & 1)); + const size_t num_keys = num_lanes / st.LanesPerKey(); + + constexpr size_t kMaxMisalign = 16; + auto aligned = + hwy::AllocateAligned(kMaxMisalign + num_lanes + kMaxMisalign); + for (Algo algo : AlgoForTest()) { + for (Dist dist : AllDist()) { + for (size_t misalign : {size_t{0}, size_t{st.LanesPerKey()}, + size_t{3 * st.LanesPerKey()}, kMaxMisalign / 2}) { + LaneType* lanes = aligned.get() + misalign; + + // Set up red zones before/after the keys to sort + for (size_t i = 0; i < misalign; ++i) { + aligned[i] = hwy::LowestValue(); + } + for (size_t i = 0; i < kMaxMisalign; ++i) { + lanes[num_lanes + i] = hwy::HighestValue(); + } +#if HWY_IS_MSAN + __msan_poison(aligned.get(), misalign * sizeof(LaneType)); + __msan_poison(lanes + num_lanes, kMaxMisalign * sizeof(LaneType)); +#endif + InputStats input_stats = + GenerateInput(dist, lanes, num_lanes); + + CompareResults compare(lanes, num_lanes); + Run(algo, reinterpret_cast(lanes), num_keys, shared, + /*thread=*/0); + HWY_ASSERT(compare.Verify(lanes)); + HWY_ASSERT(VerifySort(st, input_stats, lanes, num_lanes, "TestSort")); + + // Check red zones +#if HWY_IS_MSAN + __msan_unpoison(aligned.get(), misalign * sizeof(LaneType)); + __msan_unpoison(lanes + num_lanes, kMaxMisalign * sizeof(LaneType)); +#endif + for (size_t i = 0; i < misalign; ++i) { + if (aligned[i] != hwy::LowestValue()) + HWY_ABORT("Overrun left at %d\n", static_cast(i)); + } + for (size_t i = num_lanes; i < num_lanes + kMaxMisalign; ++i) { + if (lanes[i] != hwy::HighestValue()) + HWY_ABORT("Overrun right at %d\n", static_cast(i)); + } + } // misalign + } // dist + } // algo +} + +void TestAllSort() { + for (int num : {129, 504, 3 * 1000, 34567}) { + const size_t num_lanes = AdjustedReps(static_cast(num)); + TestSort > >(num_lanes); + TestSort > >(num_lanes); + + TestSort > >(num_lanes); + TestSort > >(num_lanes); + + TestSort > >(num_lanes); + TestSort > >(num_lanes); + + // WARNING: for float types, SIMD comparisons will flush denormals to + // zero, causing mismatches with scalar sorts. In this test, we avoid + // generating denormal inputs. + TestSort > >(num_lanes); +#if HWY_HAVE_FLOAT64 // protects algo-inl's GenerateRandom + if (Sorter::HaveFloat64()) { + TestSort > >(num_lanes); + } +#endif + +// Our HeapSort does not support 128-bit keys. +#if VQSORT_ENABLED + TestSort >(num_lanes); + TestSort >(num_lanes); + + TestSort >(num_lanes); + TestSort >(num_lanes); + + TestSort >(num_lanes); + TestSort >(num_lanes); +#endif + } +} + +} // namespace +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE + +namespace hwy { +namespace { +HWY_BEFORE_TEST(SortTest); +HWY_EXPORT_AND_TEST_P(SortTest, TestAllMedian); +HWY_EXPORT_AND_TEST_P(SortTest, TestAllBaseCase); +HWY_EXPORT_AND_TEST_P(SortTest, TestAllPartition); +HWY_EXPORT_AND_TEST_P(SortTest, TestAllGenerator); +HWY_EXPORT_AND_TEST_P(SortTest, TestAllSort); +} // namespace +} // namespace hwy + +#endif // HWY_ONCE diff --git a/hwy/contrib/sort/sorting_networks-inl.h b/hwy/contrib/sort/sorting_networks-inl.h new file mode 100644 index 0000000..3cc545b --- /dev/null +++ b/hwy/contrib/sort/sorting_networks-inl.h @@ -0,0 +1,695 @@ +// Copyright 2021 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Per-target +#if defined(HIGHWAY_HWY_CONTRIB_SORT_SORTING_NETWORKS_TOGGLE) == \ + defined(HWY_TARGET_TOGGLE) +#ifdef HIGHWAY_HWY_CONTRIB_SORT_SORTING_NETWORKS_TOGGLE +#undef HIGHWAY_HWY_CONTRIB_SORT_SORTING_NETWORKS_TOGGLE +#else +#define HIGHWAY_HWY_CONTRIB_SORT_SORTING_NETWORKS_TOGGLE +#endif + +#include "hwy/contrib/sort/shared-inl.h" // SortConstants +#include "hwy/highway.h" + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { +namespace detail { + +#if VQSORT_ENABLED + +using Constants = hwy::SortConstants; + +// ------------------------------ SharedTraits + +// Code shared between all traits. It's unclear whether these can profitably be +// specialized for Lane vs Block, or optimized like SortPairsDistance1 using +// Compare/DupOdd. +template +struct SharedTraits : public Base { + // Conditionally swaps lane 0 with 2, 1 with 3 etc. + template + HWY_INLINE Vec SortPairsDistance2(D d, Vec v) const { + const Base* base = static_cast(this); + Vec swapped = base->SwapAdjacentPairs(d, v); + base->Sort2(d, v, swapped); + return base->OddEvenPairs(d, swapped, v); + } + + // Swaps with the vector formed by reversing contiguous groups of 8 keys. + template + HWY_INLINE Vec SortPairsReverse8(D d, Vec v) const { + const Base* base = static_cast(this); + Vec swapped = base->ReverseKeys8(d, v); + base->Sort2(d, v, swapped); + return base->OddEvenQuads(d, swapped, v); + } + + // Swaps with the vector formed by reversing contiguous groups of 8 keys. + template + HWY_INLINE Vec SortPairsReverse16(D d, Vec v) const { + const Base* base = static_cast(this); + static_assert(Constants::kMaxCols <= 16, "Need actual Reverse16"); + Vec swapped = base->ReverseKeys(d, v); + base->Sort2(d, v, swapped); + return ConcatUpperLower(d, swapped, v); // 8 = half of the vector + } +}; + +// ------------------------------ Sorting network + +// (Green's irregular) sorting network for independent columns in 16 vectors. 
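+// 60 compare-exchange operations; each st.Sort2 conditionally swaps the
+// corresponding lanes of two vectors, so all columns are sorted in parallel.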
+template > +HWY_INLINE void Sort16(D d, Traits st, V& v0, V& v1, V& v2, V& v3, V& v4, V& v5, + V& v6, V& v7, V& v8, V& v9, V& va, V& vb, V& vc, V& vd, + V& ve, V& vf) { + st.Sort2(d, v0, v1); + st.Sort2(d, v2, v3); + st.Sort2(d, v4, v5); + st.Sort2(d, v6, v7); + st.Sort2(d, v8, v9); + st.Sort2(d, va, vb); + st.Sort2(d, vc, vd); + st.Sort2(d, ve, vf); + st.Sort2(d, v0, v2); + st.Sort2(d, v1, v3); + st.Sort2(d, v4, v6); + st.Sort2(d, v5, v7); + st.Sort2(d, v8, va); + st.Sort2(d, v9, vb); + st.Sort2(d, vc, ve); + st.Sort2(d, vd, vf); + st.Sort2(d, v0, v4); + st.Sort2(d, v1, v5); + st.Sort2(d, v2, v6); + st.Sort2(d, v3, v7); + st.Sort2(d, v8, vc); + st.Sort2(d, v9, vd); + st.Sort2(d, va, ve); + st.Sort2(d, vb, vf); + st.Sort2(d, v0, v8); + st.Sort2(d, v1, v9); + st.Sort2(d, v2, va); + st.Sort2(d, v3, vb); + st.Sort2(d, v4, vc); + st.Sort2(d, v5, vd); + st.Sort2(d, v6, ve); + st.Sort2(d, v7, vf); + st.Sort2(d, v5, va); + st.Sort2(d, v6, v9); + st.Sort2(d, v3, vc); + st.Sort2(d, v7, vb); + st.Sort2(d, vd, ve); + st.Sort2(d, v4, v8); + st.Sort2(d, v1, v2); + st.Sort2(d, v1, v4); + st.Sort2(d, v7, vd); + st.Sort2(d, v2, v8); + st.Sort2(d, vb, ve); + st.Sort2(d, v2, v4); + st.Sort2(d, v5, v6); + st.Sort2(d, v9, va); + st.Sort2(d, vb, vd); + st.Sort2(d, v3, v8); + st.Sort2(d, v7, vc); + st.Sort2(d, v3, v5); + st.Sort2(d, v6, v8); + st.Sort2(d, v7, v9); + st.Sort2(d, va, vc); + st.Sort2(d, v3, v4); + st.Sort2(d, v5, v6); + st.Sort2(d, v7, v8); + st.Sort2(d, v9, va); + st.Sort2(d, vb, vc); + st.Sort2(d, v6, v7); + st.Sort2(d, v8, v9); +} + +// ------------------------------ Merging networks + +// Blacher's hybrid bitonic/odd-even networks, generated by print_network.cc. + +template > +HWY_INLINE void Merge2(D d, Traits st, V& v0, V& v1, V& v2, V& v3, V& v4, V& v5, + V& v6, V& v7, V& v8, V& v9, V& va, V& vb, V& vc, V& vd, + V& ve, V& vf) { + v8 = st.ReverseKeys2(d, v8); + v9 = st.ReverseKeys2(d, v9); + va = st.ReverseKeys2(d, va); + vb = st.ReverseKeys2(d, vb); + vc = st.ReverseKeys2(d, vc); + vd = st.ReverseKeys2(d, vd); + ve = st.ReverseKeys2(d, ve); + vf = st.ReverseKeys2(d, vf); + st.Sort2(d, v0, vf); + st.Sort2(d, v1, ve); + st.Sort2(d, v2, vd); + st.Sort2(d, v3, vc); + st.Sort2(d, v4, vb); + st.Sort2(d, v5, va); + st.Sort2(d, v6, v9); + st.Sort2(d, v7, v8); + v4 = st.ReverseKeys2(d, v4); + vc = st.ReverseKeys2(d, vc); + v5 = st.ReverseKeys2(d, v5); + vd = st.ReverseKeys2(d, vd); + v6 = st.ReverseKeys2(d, v6); + ve = st.ReverseKeys2(d, ve); + v7 = st.ReverseKeys2(d, v7); + vf = st.ReverseKeys2(d, vf); + st.Sort2(d, v0, v7); + st.Sort2(d, v8, vf); + st.Sort2(d, v1, v6); + st.Sort2(d, v9, ve); + st.Sort2(d, v2, v5); + st.Sort2(d, va, vd); + st.Sort2(d, v3, v4); + st.Sort2(d, vb, vc); + v2 = st.ReverseKeys2(d, v2); + v3 = st.ReverseKeys2(d, v3); + v6 = st.ReverseKeys2(d, v6); + v7 = st.ReverseKeys2(d, v7); + va = st.ReverseKeys2(d, va); + vb = st.ReverseKeys2(d, vb); + ve = st.ReverseKeys2(d, ve); + vf = st.ReverseKeys2(d, vf); + st.Sort2(d, v0, v3); + st.Sort2(d, v1, v2); + st.Sort2(d, v4, v7); + st.Sort2(d, v5, v6); + st.Sort2(d, v8, vb); + st.Sort2(d, v9, va); + st.Sort2(d, vc, vf); + st.Sort2(d, vd, ve); + v1 = st.ReverseKeys2(d, v1); + v3 = st.ReverseKeys2(d, v3); + v5 = st.ReverseKeys2(d, v5); + v7 = st.ReverseKeys2(d, v7); + v9 = st.ReverseKeys2(d, v9); + vb = st.ReverseKeys2(d, vb); + vd = st.ReverseKeys2(d, vd); + vf = st.ReverseKeys2(d, vf); + st.Sort2(d, v0, v1); + st.Sort2(d, v2, v3); + st.Sort2(d, v4, v5); + st.Sort2(d, v6, v7); + st.Sort2(d, v8, v9); + st.Sort2(d, va, vb); + 
st.Sort2(d, vc, vd); + st.Sort2(d, ve, vf); + v0 = st.SortPairsDistance1(d, v0); + v1 = st.SortPairsDistance1(d, v1); + v2 = st.SortPairsDistance1(d, v2); + v3 = st.SortPairsDistance1(d, v3); + v4 = st.SortPairsDistance1(d, v4); + v5 = st.SortPairsDistance1(d, v5); + v6 = st.SortPairsDistance1(d, v6); + v7 = st.SortPairsDistance1(d, v7); + v8 = st.SortPairsDistance1(d, v8); + v9 = st.SortPairsDistance1(d, v9); + va = st.SortPairsDistance1(d, va); + vb = st.SortPairsDistance1(d, vb); + vc = st.SortPairsDistance1(d, vc); + vd = st.SortPairsDistance1(d, vd); + ve = st.SortPairsDistance1(d, ve); + vf = st.SortPairsDistance1(d, vf); +} + +template > +HWY_INLINE void Merge4(D d, Traits st, V& v0, V& v1, V& v2, V& v3, V& v4, V& v5, + V& v6, V& v7, V& v8, V& v9, V& va, V& vb, V& vc, V& vd, + V& ve, V& vf) { + v8 = st.ReverseKeys4(d, v8); + v9 = st.ReverseKeys4(d, v9); + va = st.ReverseKeys4(d, va); + vb = st.ReverseKeys4(d, vb); + vc = st.ReverseKeys4(d, vc); + vd = st.ReverseKeys4(d, vd); + ve = st.ReverseKeys4(d, ve); + vf = st.ReverseKeys4(d, vf); + st.Sort2(d, v0, vf); + st.Sort2(d, v1, ve); + st.Sort2(d, v2, vd); + st.Sort2(d, v3, vc); + st.Sort2(d, v4, vb); + st.Sort2(d, v5, va); + st.Sort2(d, v6, v9); + st.Sort2(d, v7, v8); + v4 = st.ReverseKeys4(d, v4); + vc = st.ReverseKeys4(d, vc); + v5 = st.ReverseKeys4(d, v5); + vd = st.ReverseKeys4(d, vd); + v6 = st.ReverseKeys4(d, v6); + ve = st.ReverseKeys4(d, ve); + v7 = st.ReverseKeys4(d, v7); + vf = st.ReverseKeys4(d, vf); + st.Sort2(d, v0, v7); + st.Sort2(d, v8, vf); + st.Sort2(d, v1, v6); + st.Sort2(d, v9, ve); + st.Sort2(d, v2, v5); + st.Sort2(d, va, vd); + st.Sort2(d, v3, v4); + st.Sort2(d, vb, vc); + v2 = st.ReverseKeys4(d, v2); + v3 = st.ReverseKeys4(d, v3); + v6 = st.ReverseKeys4(d, v6); + v7 = st.ReverseKeys4(d, v7); + va = st.ReverseKeys4(d, va); + vb = st.ReverseKeys4(d, vb); + ve = st.ReverseKeys4(d, ve); + vf = st.ReverseKeys4(d, vf); + st.Sort2(d, v0, v3); + st.Sort2(d, v1, v2); + st.Sort2(d, v4, v7); + st.Sort2(d, v5, v6); + st.Sort2(d, v8, vb); + st.Sort2(d, v9, va); + st.Sort2(d, vc, vf); + st.Sort2(d, vd, ve); + v1 = st.ReverseKeys4(d, v1); + v3 = st.ReverseKeys4(d, v3); + v5 = st.ReverseKeys4(d, v5); + v7 = st.ReverseKeys4(d, v7); + v9 = st.ReverseKeys4(d, v9); + vb = st.ReverseKeys4(d, vb); + vd = st.ReverseKeys4(d, vd); + vf = st.ReverseKeys4(d, vf); + st.Sort2(d, v0, v1); + st.Sort2(d, v2, v3); + st.Sort2(d, v4, v5); + st.Sort2(d, v6, v7); + st.Sort2(d, v8, v9); + st.Sort2(d, va, vb); + st.Sort2(d, vc, vd); + st.Sort2(d, ve, vf); + v0 = st.SortPairsReverse4(d, v0); + v1 = st.SortPairsReverse4(d, v1); + v2 = st.SortPairsReverse4(d, v2); + v3 = st.SortPairsReverse4(d, v3); + v4 = st.SortPairsReverse4(d, v4); + v5 = st.SortPairsReverse4(d, v5); + v6 = st.SortPairsReverse4(d, v6); + v7 = st.SortPairsReverse4(d, v7); + v8 = st.SortPairsReverse4(d, v8); + v9 = st.SortPairsReverse4(d, v9); + va = st.SortPairsReverse4(d, va); + vb = st.SortPairsReverse4(d, vb); + vc = st.SortPairsReverse4(d, vc); + vd = st.SortPairsReverse4(d, vd); + ve = st.SortPairsReverse4(d, ve); + vf = st.SortPairsReverse4(d, vf); + v0 = st.SortPairsDistance1(d, v0); + v1 = st.SortPairsDistance1(d, v1); + v2 = st.SortPairsDistance1(d, v2); + v3 = st.SortPairsDistance1(d, v3); + v4 = st.SortPairsDistance1(d, v4); + v5 = st.SortPairsDistance1(d, v5); + v6 = st.SortPairsDistance1(d, v6); + v7 = st.SortPairsDistance1(d, v7); + v8 = st.SortPairsDistance1(d, v8); + v9 = st.SortPairsDistance1(d, v9); + va = st.SortPairsDistance1(d, va); + vb = st.SortPairsDistance1(d, 
vb); + vc = st.SortPairsDistance1(d, vc); + vd = st.SortPairsDistance1(d, vd); + ve = st.SortPairsDistance1(d, ve); + vf = st.SortPairsDistance1(d, vf); +} + +template > +HWY_INLINE void Merge8(D d, Traits st, V& v0, V& v1, V& v2, V& v3, V& v4, V& v5, + V& v6, V& v7, V& v8, V& v9, V& va, V& vb, V& vc, V& vd, + V& ve, V& vf) { + v8 = st.ReverseKeys8(d, v8); + v9 = st.ReverseKeys8(d, v9); + va = st.ReverseKeys8(d, va); + vb = st.ReverseKeys8(d, vb); + vc = st.ReverseKeys8(d, vc); + vd = st.ReverseKeys8(d, vd); + ve = st.ReverseKeys8(d, ve); + vf = st.ReverseKeys8(d, vf); + st.Sort2(d, v0, vf); + st.Sort2(d, v1, ve); + st.Sort2(d, v2, vd); + st.Sort2(d, v3, vc); + st.Sort2(d, v4, vb); + st.Sort2(d, v5, va); + st.Sort2(d, v6, v9); + st.Sort2(d, v7, v8); + v4 = st.ReverseKeys8(d, v4); + vc = st.ReverseKeys8(d, vc); + v5 = st.ReverseKeys8(d, v5); + vd = st.ReverseKeys8(d, vd); + v6 = st.ReverseKeys8(d, v6); + ve = st.ReverseKeys8(d, ve); + v7 = st.ReverseKeys8(d, v7); + vf = st.ReverseKeys8(d, vf); + st.Sort2(d, v0, v7); + st.Sort2(d, v8, vf); + st.Sort2(d, v1, v6); + st.Sort2(d, v9, ve); + st.Sort2(d, v2, v5); + st.Sort2(d, va, vd); + st.Sort2(d, v3, v4); + st.Sort2(d, vb, vc); + v2 = st.ReverseKeys8(d, v2); + v3 = st.ReverseKeys8(d, v3); + v6 = st.ReverseKeys8(d, v6); + v7 = st.ReverseKeys8(d, v7); + va = st.ReverseKeys8(d, va); + vb = st.ReverseKeys8(d, vb); + ve = st.ReverseKeys8(d, ve); + vf = st.ReverseKeys8(d, vf); + st.Sort2(d, v0, v3); + st.Sort2(d, v1, v2); + st.Sort2(d, v4, v7); + st.Sort2(d, v5, v6); + st.Sort2(d, v8, vb); + st.Sort2(d, v9, va); + st.Sort2(d, vc, vf); + st.Sort2(d, vd, ve); + v1 = st.ReverseKeys8(d, v1); + v3 = st.ReverseKeys8(d, v3); + v5 = st.ReverseKeys8(d, v5); + v7 = st.ReverseKeys8(d, v7); + v9 = st.ReverseKeys8(d, v9); + vb = st.ReverseKeys8(d, vb); + vd = st.ReverseKeys8(d, vd); + vf = st.ReverseKeys8(d, vf); + st.Sort2(d, v0, v1); + st.Sort2(d, v2, v3); + st.Sort2(d, v4, v5); + st.Sort2(d, v6, v7); + st.Sort2(d, v8, v9); + st.Sort2(d, va, vb); + st.Sort2(d, vc, vd); + st.Sort2(d, ve, vf); + v0 = st.SortPairsReverse8(d, v0); + v1 = st.SortPairsReverse8(d, v1); + v2 = st.SortPairsReverse8(d, v2); + v3 = st.SortPairsReverse8(d, v3); + v4 = st.SortPairsReverse8(d, v4); + v5 = st.SortPairsReverse8(d, v5); + v6 = st.SortPairsReverse8(d, v6); + v7 = st.SortPairsReverse8(d, v7); + v8 = st.SortPairsReverse8(d, v8); + v9 = st.SortPairsReverse8(d, v9); + va = st.SortPairsReverse8(d, va); + vb = st.SortPairsReverse8(d, vb); + vc = st.SortPairsReverse8(d, vc); + vd = st.SortPairsReverse8(d, vd); + ve = st.SortPairsReverse8(d, ve); + vf = st.SortPairsReverse8(d, vf); + v0 = st.SortPairsDistance2(d, v0); + v1 = st.SortPairsDistance2(d, v1); + v2 = st.SortPairsDistance2(d, v2); + v3 = st.SortPairsDistance2(d, v3); + v4 = st.SortPairsDistance2(d, v4); + v5 = st.SortPairsDistance2(d, v5); + v6 = st.SortPairsDistance2(d, v6); + v7 = st.SortPairsDistance2(d, v7); + v8 = st.SortPairsDistance2(d, v8); + v9 = st.SortPairsDistance2(d, v9); + va = st.SortPairsDistance2(d, va); + vb = st.SortPairsDistance2(d, vb); + vc = st.SortPairsDistance2(d, vc); + vd = st.SortPairsDistance2(d, vd); + ve = st.SortPairsDistance2(d, ve); + vf = st.SortPairsDistance2(d, vf); + v0 = st.SortPairsDistance1(d, v0); + v1 = st.SortPairsDistance1(d, v1); + v2 = st.SortPairsDistance1(d, v2); + v3 = st.SortPairsDistance1(d, v3); + v4 = st.SortPairsDistance1(d, v4); + v5 = st.SortPairsDistance1(d, v5); + v6 = st.SortPairsDistance1(d, v6); + v7 = st.SortPairsDistance1(d, v7); + v8 = st.SortPairsDistance1(d, 
v8); + v9 = st.SortPairsDistance1(d, v9); + va = st.SortPairsDistance1(d, va); + vb = st.SortPairsDistance1(d, vb); + vc = st.SortPairsDistance1(d, vc); + vd = st.SortPairsDistance1(d, vd); + ve = st.SortPairsDistance1(d, ve); + vf = st.SortPairsDistance1(d, vf); +} + +// Unused on MSVC, see below +#if !HWY_COMPILER_MSVC + +template > +HWY_INLINE void Merge16(D d, Traits st, V& v0, V& v1, V& v2, V& v3, V& v4, + V& v5, V& v6, V& v7, V& v8, V& v9, V& va, V& vb, V& vc, + V& vd, V& ve, V& vf) { + v8 = st.ReverseKeys16(d, v8); + v9 = st.ReverseKeys16(d, v9); + va = st.ReverseKeys16(d, va); + vb = st.ReverseKeys16(d, vb); + vc = st.ReverseKeys16(d, vc); + vd = st.ReverseKeys16(d, vd); + ve = st.ReverseKeys16(d, ve); + vf = st.ReverseKeys16(d, vf); + st.Sort2(d, v0, vf); + st.Sort2(d, v1, ve); + st.Sort2(d, v2, vd); + st.Sort2(d, v3, vc); + st.Sort2(d, v4, vb); + st.Sort2(d, v5, va); + st.Sort2(d, v6, v9); + st.Sort2(d, v7, v8); + v4 = st.ReverseKeys16(d, v4); + vc = st.ReverseKeys16(d, vc); + v5 = st.ReverseKeys16(d, v5); + vd = st.ReverseKeys16(d, vd); + v6 = st.ReverseKeys16(d, v6); + ve = st.ReverseKeys16(d, ve); + v7 = st.ReverseKeys16(d, v7); + vf = st.ReverseKeys16(d, vf); + st.Sort2(d, v0, v7); + st.Sort2(d, v8, vf); + st.Sort2(d, v1, v6); + st.Sort2(d, v9, ve); + st.Sort2(d, v2, v5); + st.Sort2(d, va, vd); + st.Sort2(d, v3, v4); + st.Sort2(d, vb, vc); + v2 = st.ReverseKeys16(d, v2); + v3 = st.ReverseKeys16(d, v3); + v6 = st.ReverseKeys16(d, v6); + v7 = st.ReverseKeys16(d, v7); + va = st.ReverseKeys16(d, va); + vb = st.ReverseKeys16(d, vb); + ve = st.ReverseKeys16(d, ve); + vf = st.ReverseKeys16(d, vf); + st.Sort2(d, v0, v3); + st.Sort2(d, v1, v2); + st.Sort2(d, v4, v7); + st.Sort2(d, v5, v6); + st.Sort2(d, v8, vb); + st.Sort2(d, v9, va); + st.Sort2(d, vc, vf); + st.Sort2(d, vd, ve); + v1 = st.ReverseKeys16(d, v1); + v3 = st.ReverseKeys16(d, v3); + v5 = st.ReverseKeys16(d, v5); + v7 = st.ReverseKeys16(d, v7); + v9 = st.ReverseKeys16(d, v9); + vb = st.ReverseKeys16(d, vb); + vd = st.ReverseKeys16(d, vd); + vf = st.ReverseKeys16(d, vf); + st.Sort2(d, v0, v1); + st.Sort2(d, v2, v3); + st.Sort2(d, v4, v5); + st.Sort2(d, v6, v7); + st.Sort2(d, v8, v9); + st.Sort2(d, va, vb); + st.Sort2(d, vc, vd); + st.Sort2(d, ve, vf); + v0 = st.SortPairsReverse16(d, v0); + v1 = st.SortPairsReverse16(d, v1); + v2 = st.SortPairsReverse16(d, v2); + v3 = st.SortPairsReverse16(d, v3); + v4 = st.SortPairsReverse16(d, v4); + v5 = st.SortPairsReverse16(d, v5); + v6 = st.SortPairsReverse16(d, v6); + v7 = st.SortPairsReverse16(d, v7); + v8 = st.SortPairsReverse16(d, v8); + v9 = st.SortPairsReverse16(d, v9); + va = st.SortPairsReverse16(d, va); + vb = st.SortPairsReverse16(d, vb); + vc = st.SortPairsReverse16(d, vc); + vd = st.SortPairsReverse16(d, vd); + ve = st.SortPairsReverse16(d, ve); + vf = st.SortPairsReverse16(d, vf); + v0 = st.SortPairsDistance4(d, v0); + v1 = st.SortPairsDistance4(d, v1); + v2 = st.SortPairsDistance4(d, v2); + v3 = st.SortPairsDistance4(d, v3); + v4 = st.SortPairsDistance4(d, v4); + v5 = st.SortPairsDistance4(d, v5); + v6 = st.SortPairsDistance4(d, v6); + v7 = st.SortPairsDistance4(d, v7); + v8 = st.SortPairsDistance4(d, v8); + v9 = st.SortPairsDistance4(d, v9); + va = st.SortPairsDistance4(d, va); + vb = st.SortPairsDistance4(d, vb); + vc = st.SortPairsDistance4(d, vc); + vd = st.SortPairsDistance4(d, vd); + ve = st.SortPairsDistance4(d, ve); + vf = st.SortPairsDistance4(d, vf); + v0 = st.SortPairsDistance2(d, v0); + v1 = st.SortPairsDistance2(d, v1); + v2 = st.SortPairsDistance2(d, v2); + 
v3 = st.SortPairsDistance2(d, v3);
+  v4 = st.SortPairsDistance2(d, v4);
+  v5 = st.SortPairsDistance2(d, v5);
+  v6 = st.SortPairsDistance2(d, v6);
+  v7 = st.SortPairsDistance2(d, v7);
+  v8 = st.SortPairsDistance2(d, v8);
+  v9 = st.SortPairsDistance2(d, v9);
+  va = st.SortPairsDistance2(d, va);
+  vb = st.SortPairsDistance2(d, vb);
+  vc = st.SortPairsDistance2(d, vc);
+  vd = st.SortPairsDistance2(d, vd);
+  ve = st.SortPairsDistance2(d, ve);
+  vf = st.SortPairsDistance2(d, vf);
+  v0 = st.SortPairsDistance1(d, v0);
+  v1 = st.SortPairsDistance1(d, v1);
+  v2 = st.SortPairsDistance1(d, v2);
+  v3 = st.SortPairsDistance1(d, v3);
+  v4 = st.SortPairsDistance1(d, v4);
+  v5 = st.SortPairsDistance1(d, v5);
+  v6 = st.SortPairsDistance1(d, v6);
+  v7 = st.SortPairsDistance1(d, v7);
+  v8 = st.SortPairsDistance1(d, v8);
+  v9 = st.SortPairsDistance1(d, v9);
+  va = st.SortPairsDistance1(d, va);
+  vb = st.SortPairsDistance1(d, vb);
+  vc = st.SortPairsDistance1(d, vc);
+  vd = st.SortPairsDistance1(d, vd);
+  ve = st.SortPairsDistance1(d, ve);
+  vf = st.SortPairsDistance1(d, vf);
+}
+
+#endif  // !HWY_COMPILER_MSVC
+
+// Reshapes `buf` into a matrix, sorts columns independently, and then merges
+// into a sorted 1D array without transposing.
+//
+// `st` is SharedTraits<Traits*<Order*>>. This abstraction layer bridges
+// differences in sort order and single-lane vs 128-bit keys.
+// `buf` ensures full vectors are aligned, and enables loads/stores without
+// bounds checks.
+//
+// NOINLINE because this is large and called twice from vqsort-inl.h.
+//
+// References:
+// https://drops.dagstuhl.de/opus/volltexte/2021/13775/pdf/LIPIcs-SEA-2021-3.pdf
+// https://github.com/simd-sorting/fast-and-robust/blob/master/avx2_sort_demo/avx2sort.h
+// "Entwurf und Implementierung vektorisierter Sortieralgorithmen" (M. Blacher)
+template <class Traits, typename T>
+HWY_NOINLINE void SortingNetwork(Traits st, T* HWY_RESTRICT buf, size_t cols) {
+  const CappedTag<T, Constants::kMaxCols> d;
+  using V = decltype(Zero(d));
+
+  HWY_DASSERT(cols <= Constants::kMaxCols);
+
+  // The network width depends on the number of keys, not lanes.
+  constexpr size_t kLanesPerKey = st.LanesPerKey();
+  const size_t keys = cols / kLanesPerKey;
+  constexpr size_t kMaxKeys = MaxLanes(d) / kLanesPerKey;
+
+  // These are aligned iff cols == Lanes(d). We prefer unaligned/non-constexpr
+  // offsets to duplicating this code for every value of cols.
+  static_assert(Constants::kMaxRows == 16, "Update loads/stores/args");
+  V v0 = LoadU(d, buf + 0x0 * cols);
+  V v1 = LoadU(d, buf + 0x1 * cols);
+  V v2 = LoadU(d, buf + 0x2 * cols);
+  V v3 = LoadU(d, buf + 0x3 * cols);
+  V v4 = LoadU(d, buf + 0x4 * cols);
+  V v5 = LoadU(d, buf + 0x5 * cols);
+  V v6 = LoadU(d, buf + 0x6 * cols);
+  V v7 = LoadU(d, buf + 0x7 * cols);
+  V v8 = LoadU(d, buf + 0x8 * cols);
+  V v9 = LoadU(d, buf + 0x9 * cols);
+  V va = LoadU(d, buf + 0xa * cols);
+  V vb = LoadU(d, buf + 0xb * cols);
+  V vc = LoadU(d, buf + 0xc * cols);
+  V vd = LoadU(d, buf + 0xd * cols);
+  V ve = LoadU(d, buf + 0xe * cols);
+  V vf = LoadU(d, buf + 0xf * cols);
+
+  Sort16(d, st, v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, va, vb, vc, vd, ve,
+         vf);
+
+  // Checking MaxLanes avoids generating HWY_ASSERT code for the unreachable
+  // code paths: if MaxLanes < 2, then keys <= cols < 2.
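+  // (kMaxKeys is a compile-time constant: where it rules out a merge size,
+  // the corresponding branch below is known false at compile time, so the
+  // unreachable Merge* path costs nothing at runtime.)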
+ if (HWY_LIKELY(keys >= 2 && kMaxKeys >= 2)) { + Merge2(d, st, v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, va, vb, vc, vd, ve, + vf); + + if (HWY_LIKELY(keys >= 4 && kMaxKeys >= 4)) { + Merge4(d, st, v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, va, vb, vc, vd, ve, + vf); + + if (HWY_LIKELY(keys >= 8 && kMaxKeys >= 8)) { + Merge8(d, st, v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, va, vb, vc, vd, + ve, vf); + + // Avoids build timeout. Must match #if condition in kMaxCols. +#if !HWY_COMPILER_MSVC && !HWY_IS_DEBUG_BUILD + if (HWY_LIKELY(keys >= 16 && kMaxKeys >= 16)) { + Merge16(d, st, v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, va, vb, vc, vd, + ve, vf); + + static_assert(Constants::kMaxCols <= 16, "Add more branches"); + } +#endif + } + } + } + + StoreU(v0, d, buf + 0x0 * cols); + StoreU(v1, d, buf + 0x1 * cols); + StoreU(v2, d, buf + 0x2 * cols); + StoreU(v3, d, buf + 0x3 * cols); + StoreU(v4, d, buf + 0x4 * cols); + StoreU(v5, d, buf + 0x5 * cols); + StoreU(v6, d, buf + 0x6 * cols); + StoreU(v7, d, buf + 0x7 * cols); + StoreU(v8, d, buf + 0x8 * cols); + StoreU(v9, d, buf + 0x9 * cols); + StoreU(va, d, buf + 0xa * cols); + StoreU(vb, d, buf + 0xb * cols); + StoreU(vc, d, buf + 0xc * cols); + StoreU(vd, d, buf + 0xd * cols); + StoreU(ve, d, buf + 0xe * cols); + StoreU(vf, d, buf + 0xf * cols); +} + +#else +template +struct SharedTraits : public Base {}; +#endif // VQSORT_ENABLED + +} // namespace detail +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +#endif // HIGHWAY_HWY_CONTRIB_SORT_SORTING_NETWORKS_TOGGLE diff --git a/hwy/contrib/sort/traits-inl.h b/hwy/contrib/sort/traits-inl.h new file mode 100644 index 0000000..8b87c82 --- /dev/null +++ b/hwy/contrib/sort/traits-inl.h @@ -0,0 +1,527 @@ +// Copyright 2021 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Per-target +#if defined(HIGHWAY_HWY_CONTRIB_SORT_TRAITS_TOGGLE) == \ + defined(HWY_TARGET_TOGGLE) +#ifdef HIGHWAY_HWY_CONTRIB_SORT_TRAITS_TOGGLE +#undef HIGHWAY_HWY_CONTRIB_SORT_TRAITS_TOGGLE +#else +#define HIGHWAY_HWY_CONTRIB_SORT_TRAITS_TOGGLE +#endif + +#include + +#include "hwy/contrib/sort/shared-inl.h" // SortConstants +#include "hwy/contrib/sort/vqsort.h" // SortDescending +#include "hwy/highway.h" +#include "hwy/print.h" + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { +namespace detail { + +#if VQSORT_ENABLED || HWY_IDE + +// Highway does not provide a lane type for 128-bit keys, so we use uint64_t +// along with an abstraction layer for single-lane vs. lane-pair, which is +// independent of the order. +template +struct KeyLane { + static constexpr bool Is128() { return false; } + constexpr size_t LanesPerKey() const { return 1; } + + // What type bench_sort should allocate for generating inputs. + using LaneType = T; + // What type to pass to Sorter::operator(). 
+ using KeyType = T; + + std::string KeyString() const { + char string100[100]; + hwy::detail::TypeName(hwy::detail::MakeTypeInfo(), 1, string100); + return string100; + } + + // For HeapSort + HWY_INLINE void Swap(T* a, T* b) const { + const T temp = *a; + *a = *b; + *b = temp; + } + + template + HWY_INLINE V CompressKeys(V keys, M mask) const { + return CompressNot(keys, mask); + } + + // Broadcasts one key into a vector + template + HWY_INLINE Vec SetKey(D d, const T* key) const { + return Set(d, *key); + } + + template + HWY_INLINE Mask EqualKeys(D /*tag*/, Vec a, Vec b) const { + return Eq(a, b); + } + + template + HWY_INLINE Mask NotEqualKeys(D /*tag*/, Vec a, Vec b) const { + return Ne(a, b); + } + + HWY_INLINE bool Equal1(const T* a, const T* b) { return *a == *b; } + + template + HWY_INLINE Vec ReverseKeys(D d, Vec v) const { + return Reverse(d, v); + } + + template + HWY_INLINE Vec ReverseKeys2(D d, Vec v) const { + return Reverse2(d, v); + } + + template + HWY_INLINE Vec ReverseKeys4(D d, Vec v) const { + return Reverse4(d, v); + } + + template + HWY_INLINE Vec ReverseKeys8(D d, Vec v) const { + return Reverse8(d, v); + } + + template + HWY_INLINE Vec ReverseKeys16(D d, Vec v) const { + static_assert(SortConstants::kMaxCols <= 16, "Assumes u32x16 = 512 bit"); + return ReverseKeys(d, v); + } + + template + HWY_INLINE V OddEvenKeys(const V odd, const V even) const { + return OddEven(odd, even); + } + + template + HWY_INLINE Vec SwapAdjacentPairs(D d, const Vec v) const { + const Repartition du32; + return BitCast(d, Shuffle2301(BitCast(du32, v))); + } + template + HWY_INLINE Vec SwapAdjacentPairs(D /* tag */, const Vec v) const { + return Shuffle1032(v); + } + template + HWY_INLINE Vec SwapAdjacentPairs(D /* tag */, const Vec v) const { + return SwapAdjacentBlocks(v); + } + + template + HWY_INLINE Vec SwapAdjacentQuads(D d, const Vec v) const { +#if HWY_HAVE_FLOAT64 // in case D is float32 + const RepartitionToWide dw; +#else + const RepartitionToWide > dw; +#endif + return BitCast(d, SwapAdjacentPairs(dw, BitCast(dw, v))); + } + template + HWY_INLINE Vec SwapAdjacentQuads(D d, const Vec v) const { + // Assumes max vector size = 512 + return ConcatLowerUpper(d, v, v); + } + + template + HWY_INLINE Vec OddEvenPairs(D d, const Vec odd, + const Vec even) const { +#if HWY_HAVE_FLOAT64 // in case D is float32 + const RepartitionToWide dw; +#else + const RepartitionToWide > dw; +#endif + return BitCast(d, OddEven(BitCast(dw, odd), BitCast(dw, even))); + } + template + HWY_INLINE Vec OddEvenPairs(D /* tag */, Vec odd, Vec even) const { + return OddEvenBlocks(odd, even); + } + + template + HWY_INLINE Vec OddEvenQuads(D d, Vec odd, Vec even) const { +#if HWY_HAVE_FLOAT64 // in case D is float32 + const RepartitionToWide dw; +#else + const RepartitionToWide > dw; +#endif + return BitCast(d, OddEvenPairs(dw, BitCast(dw, odd), BitCast(dw, even))); + } + template + HWY_INLINE Vec OddEvenQuads(D d, Vec odd, Vec even) const { + return ConcatUpperLower(d, odd, even); + } +}; + +// Anything order-related depends on the key traits *and* the order (see +// FirstOfLanes). We cannot implement just one Compare function because Lt128 +// only compiles if the lane type is u64. Thus we need either overloaded +// functions with a tag type, class specializations, or separate classes. +// We avoid overloaded functions because we want all functions to be callable +// from a SortTraits without per-function wrappers. Specializing would work, but +// we are anyway going to specialize at a higher level. 
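+// A minimal scalar sketch of this layering (illustrative only; the Sketch*
+// names below are hypothetical and not used elsewhere): the order class
+// contributes just the comparison, and a single compare-exchange "node" then
+// serves both sort directions.
+struct SketchOrderAscending {
+  template <typename T>
+  static bool Before(T a, T b) { return a < b; }
+};
+struct SketchOrderDescending {
+  template <typename T>
+  static bool Before(T a, T b) { return b < a; }
+};
+template <class Order, typename T>
+static inline void SketchSort2(T& a, T& b) {
+  // Afterwards, Order considers a to be first (or the keys were equal); e.g.
+  // SketchSort2<SketchOrderDescending>(x, y) leaves x >= y.
+  if (Order::Before(b, a)) {
+    const T t = a;
+    a = b;
+    b = t;
+  }
+}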
+template +struct OrderAscending : public KeyLane { + using Order = SortAscending; + + HWY_INLINE bool Compare1(const T* a, const T* b) { return *a < *b; } + + template + HWY_INLINE Mask Compare(D /* tag */, Vec a, Vec b) const { + return Lt(a, b); + } + + // Two halves of Sort2, used in ScanMinMax. + template + HWY_INLINE Vec First(D /* tag */, const Vec a, const Vec b) const { + return Min(a, b); + } + + template + HWY_INLINE Vec Last(D /* tag */, const Vec a, const Vec b) const { + return Max(a, b); + } + + template + HWY_INLINE Vec FirstOfLanes(D d, Vec v, + T* HWY_RESTRICT /* buf */) const { + return MinOfLanes(d, v); + } + + template + HWY_INLINE Vec LastOfLanes(D d, Vec v, + T* HWY_RESTRICT /* buf */) const { + return MaxOfLanes(d, v); + } + + template + HWY_INLINE Vec FirstValue(D d) const { + return Set(d, hwy::LowestValue()); + } + + template + HWY_INLINE Vec LastValue(D d) const { + return Set(d, hwy::HighestValue()); + } + + template + HWY_INLINE Vec PrevValue(D d, Vec v) const { + return Sub(v, Set(d, hwy::Epsilon())); + } +}; + +template +struct OrderDescending : public KeyLane { + using Order = SortDescending; + + HWY_INLINE bool Compare1(const T* a, const T* b) { return *b < *a; } + + template + HWY_INLINE Mask Compare(D /* tag */, Vec a, Vec b) const { + return Lt(b, a); + } + + template + HWY_INLINE Vec First(D /* tag */, const Vec a, const Vec b) const { + return Max(a, b); + } + + template + HWY_INLINE Vec Last(D /* tag */, const Vec a, const Vec b) const { + return Min(a, b); + } + + template + HWY_INLINE Vec FirstOfLanes(D d, Vec v, + T* HWY_RESTRICT /* buf */) const { + return MaxOfLanes(d, v); + } + + template + HWY_INLINE Vec LastOfLanes(D d, Vec v, + T* HWY_RESTRICT /* buf */) const { + return MinOfLanes(d, v); + } + + template + HWY_INLINE Vec FirstValue(D d) const { + return Set(d, hwy::HighestValue()); + } + + template + HWY_INLINE Vec LastValue(D d) const { + return Set(d, hwy::LowestValue()); + } + + template + HWY_INLINE Vec PrevValue(D d, Vec v) const { + return Add(v, Set(d, hwy::Epsilon())); + } +}; + +struct OrderAscendingKV64 : public KeyLane { + using Order = SortAscending; + + HWY_INLINE bool Compare1(const LaneType* a, const LaneType* b) { + return (*a >> 32) < (*b >> 32); + } + + template + HWY_INLINE Mask Compare(D /* tag */, Vec a, Vec b) const { + return Lt(ShiftRight<32>(a), ShiftRight<32>(b)); + } + + // Not required to be stable (preserving the order of equivalent keys), so + // we can include the value in the comparison. + template + HWY_INLINE Vec First(D /* tag */, const Vec a, const Vec b) const { + return Min(a, b); + } + + template + HWY_INLINE Vec Last(D /* tag */, const Vec a, const Vec b) const { + return Max(a, b); + } + + template + HWY_INLINE Vec FirstOfLanes(D d, Vec v, + uint64_t* HWY_RESTRICT /* buf */) const { + return MinOfLanes(d, v); + } + + template + HWY_INLINE Vec LastOfLanes(D d, Vec v, + uint64_t* HWY_RESTRICT /* buf */) const { + return MaxOfLanes(d, v); + } + + // Same as for regular lanes. 
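+  // (The KV64 order inspects only the upper 32 key bits, but 0 and ~0ull
+  // already bound the full 64-bit lane, and hence also bound the key stored
+  // in its upper half.)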
+ template + HWY_INLINE Vec FirstValue(D d) const { + return Set(d, hwy::LowestValue >()); + } + + template + HWY_INLINE Vec LastValue(D d) const { + return Set(d, hwy::HighestValue >()); + } + + template + HWY_INLINE Vec PrevValue(D d, Vec v) const { + return Sub(v, Set(d, 1)); + } +}; + +struct OrderDescendingKV64 : public KeyLane { + using Order = SortDescending; + + HWY_INLINE bool Compare1(const LaneType* a, const LaneType* b) { + return (*b >> 32) < (*a >> 32); + } + + template + HWY_INLINE Mask Compare(D /* tag */, Vec a, Vec b) const { + return Lt(ShiftRight<32>(b), ShiftRight<32>(a)); + } + + // Not required to be stable (preserving the order of equivalent keys), so + // we can include the value in the comparison. + template + HWY_INLINE Vec First(D /* tag */, const Vec a, const Vec b) const { + return Max(a, b); + } + + template + HWY_INLINE Vec Last(D /* tag */, const Vec a, const Vec b) const { + return Min(a, b); + } + + template + HWY_INLINE Vec FirstOfLanes(D d, Vec v, + uint64_t* HWY_RESTRICT /* buf */) const { + return MaxOfLanes(d, v); + } + + template + HWY_INLINE Vec LastOfLanes(D d, Vec v, + uint64_t* HWY_RESTRICT /* buf */) const { + return MinOfLanes(d, v); + } + + template + HWY_INLINE Vec FirstValue(D d) const { + return Set(d, hwy::HighestValue >()); + } + + template + HWY_INLINE Vec LastValue(D d) const { + return Set(d, hwy::LowestValue >()); + } + + template + HWY_INLINE Vec PrevValue(D d, Vec v) const { + return Add(v, Set(d, 1)); + } +}; + +// Shared code that depends on Order. +template +struct TraitsLane : public Base { + // For each lane i: replaces a[i] with the first and b[i] with the second + // according to Base. + // Corresponds to a conditional swap, which is one "node" of a sorting + // network. Min/Max are cheaper than compare + blend at least for integers. + template + HWY_INLINE void Sort2(D d, Vec& a, Vec& b) const { + const Base* base = static_cast(this); + + const Vec a_copy = a; + // Prior to AVX3, there is no native 64-bit Min/Max, so they compile to 4 + // instructions. We can reduce it to a compare + 2 IfThenElse. +#if HWY_AVX3 < HWY_TARGET && HWY_TARGET <= HWY_SSSE3 + if (sizeof(TFromD) == 8) { + const Mask cmp = base->Compare(d, a, b); + a = IfThenElse(cmp, a, b); + b = IfThenElse(cmp, b, a_copy); + return; + } +#endif + a = base->First(d, a, b); + b = base->Last(d, a_copy, b); + } + + // Conditionally swaps even-numbered lanes with their odd-numbered neighbor. + template + HWY_INLINE Vec SortPairsDistance1(D d, Vec v) const { + const Base* base = static_cast(this); + Vec swapped = base->ReverseKeys2(d, v); + // Further to the above optimization, Sort2+OddEvenKeys compile to four + // instructions; we can save one by combining two blends. +#if HWY_AVX3 < HWY_TARGET && HWY_TARGET <= HWY_SSSE3 + const Vec cmp = VecFromMask(d, base->Compare(d, v, swapped)); + return IfVecThenElse(DupOdd(cmp), swapped, v); +#else + Sort2(d, v, swapped); + return base->OddEvenKeys(swapped, v); +#endif + } + + // (See above - we use Sort2 for non-64-bit types.) + template + HWY_INLINE Vec SortPairsDistance1(D d, Vec v) const { + const Base* base = static_cast(this); + Vec swapped = base->ReverseKeys2(d, v); + Sort2(d, v, swapped); + return base->OddEvenKeys(swapped, v); + } + + // Swaps with the vector formed by reversing contiguous groups of 4 keys. 
+ template + HWY_INLINE Vec SortPairsReverse4(D d, Vec v) const { + const Base* base = static_cast(this); + Vec swapped = base->ReverseKeys4(d, v); + Sort2(d, v, swapped); + return base->OddEvenPairs(d, swapped, v); + } + + // Conditionally swaps lane 0 with 4, 1 with 5 etc. + template + HWY_INLINE Vec SortPairsDistance4(D d, Vec v) const { + const Base* base = static_cast(this); + Vec swapped = base->SwapAdjacentQuads(d, v); + // Only used in Merge16, so this will not be used on AVX2 (which only has 4 + // u64 lanes), so skip the above optimization for 64-bit AVX2. + Sort2(d, v, swapped); + return base->OddEvenQuads(d, swapped, v); + } +}; + +#else + +// Base class shared between OrderAscending, OrderDescending. +template +struct KeyLane { + constexpr bool Is128() const { return false; } + constexpr size_t LanesPerKey() const { return 1; } + + using LaneType = T; + using KeyType = T; + + std::string KeyString() const { + char string100[100]; + hwy::detail::TypeName(hwy::detail::MakeTypeInfo(), 1, string100); + return string100; + } +}; + +template +struct OrderAscending : public KeyLane { + using Order = SortAscending; + + HWY_INLINE bool Compare1(const T* a, const T* b) { return *a < *b; } + + template + HWY_INLINE Mask Compare(D /* tag */, Vec a, Vec b) { + return Lt(a, b); + } +}; + +template +struct OrderDescending : public KeyLane { + using Order = SortDescending; + + HWY_INLINE bool Compare1(const T* a, const T* b) { return *b < *a; } + + template + HWY_INLINE Mask Compare(D /* tag */, Vec a, Vec b) { + return Lt(b, a); + } +}; + +template +struct TraitsLane : public Order { + // For HeapSort + template // MSVC doesn't find typename Order::LaneType. + HWY_INLINE void Swap(T* a, T* b) const { + const T temp = *a; + *a = *b; + *b = temp; + } + + template + HWY_INLINE Vec SetKey(D d, const TFromD* key) const { + return Set(d, *key); + } +}; + +#endif // VQSORT_ENABLED + +} // namespace detail +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +#endif // HIGHWAY_HWY_CONTRIB_SORT_TRAITS_TOGGLE diff --git a/hwy/contrib/sort/traits128-inl.h b/hwy/contrib/sort/traits128-inl.h new file mode 100644 index 0000000..c692064 --- /dev/null +++ b/hwy/contrib/sort/traits128-inl.h @@ -0,0 +1,492 @@ +// Copyright 2021 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
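+
+// The traits below vectorize a lexicographic order on 128-bit keys stored as
+// (lo, hi) pairs of u64 lanes. A scalar sketch of that order (illustrative
+// only; `SketchLt128` is a hypothetical name, and the KV128 order further
+// down compares only the upper, key-carrying lane):
+//
+//   bool SketchLt128(const uint64_t a[2], const uint64_t b[2]) {
+//     // High halves decide; low halves only break ties.
+//     return (a[1] == b[1]) ? a[0] < b[0] : a[1] < b[1];
+//   }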
+
+// Per-target
+#if defined(HIGHWAY_HWY_CONTRIB_SORT_TRAITS128_TOGGLE) == \
+    defined(HWY_TARGET_TOGGLE)
+#ifdef HIGHWAY_HWY_CONTRIB_SORT_TRAITS128_TOGGLE
+#undef HIGHWAY_HWY_CONTRIB_SORT_TRAITS128_TOGGLE
+#else
+#define HIGHWAY_HWY_CONTRIB_SORT_TRAITS128_TOGGLE
+#endif
+
+#include <string>
+
+#include "hwy/contrib/sort/shared-inl.h"
+#include "hwy/contrib/sort/vqsort.h"  // SortDescending
+#include "hwy/highway.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+namespace detail {
+
+#if VQSORT_ENABLED || HWY_IDE
+
+// Highway does not provide a lane type for 128-bit keys, so we use uint64_t
+// along with an abstraction layer for single-lane vs. lane-pair, which is
+// independent of the order.
+struct KeyAny128 {
+  static constexpr bool Is128() { return true; }
+  constexpr size_t LanesPerKey() const { return 2; }
+
+  // What type bench_sort should allocate for generating inputs.
+  using LaneType = uint64_t;
+  // KeyType and KeyString are defined by derived classes.
+
+  HWY_INLINE void Swap(LaneType* a, LaneType* b) const {
+    const FixedTag<LaneType, 2> d;
+    const auto temp = LoadU(d, a);
+    StoreU(LoadU(d, b), d, a);
+    StoreU(temp, d, b);
+  }
+
+  template <class V, class M>
+  HWY_INLINE V CompressKeys(V keys, M mask) const {
+    return CompressBlocksNot(keys, mask);
+  }
+
+  template <class D>
+  HWY_INLINE Vec<D> SetKey(D d, const TFromD<D>* key) const {
+    return LoadDup128(d, key);
+  }
+
+  template <class D>
+  HWY_INLINE Vec<D> ReverseKeys(D d, Vec<D> v) const {
+    return ReverseBlocks(d, v);
+  }
+
+  template <class D>
+  HWY_INLINE Vec<D> ReverseKeys2(D /* tag */, const Vec<D> v) const {
+    return SwapAdjacentBlocks(v);
+  }
+
+  // Only called for 4 keys because we do not support >512-bit vectors.
+  template <class D>
+  HWY_INLINE Vec<D> ReverseKeys4(D d, const Vec<D> v) const {
+    HWY_DASSERT(Lanes(d) <= 64 / sizeof(TFromD<D>));
+    return ReverseKeys(d, v);
+  }
+
+  // Only called for 4 keys because we do not support >512-bit vectors.
+  template <class D>
+  HWY_INLINE Vec<D> OddEvenPairs(D d, const Vec<D> odd,
+                                 const Vec<D> even) const {
+    HWY_DASSERT(Lanes(d) <= 64 / sizeof(TFromD<D>));
+    return ConcatUpperLower(d, odd, even);
+  }
+
+  template <class V>
+  HWY_INLINE V OddEvenKeys(const V odd, const V even) const {
+    return OddEvenBlocks(odd, even);
+  }
+
+  template <class D>
+  HWY_INLINE Vec<D> ReverseKeys8(D, Vec<D>) const {
+    HWY_ASSERT(0);  // not supported: would require 1024-bit vectors
+  }
+
+  template <class D>
+  HWY_INLINE Vec<D> ReverseKeys16(D, Vec<D>) const {
+    HWY_ASSERT(0);  // not supported: would require 2048-bit vectors
+  }
+
+  // This is only called for 8/16 col networks (not supported).
+  template <class D>
+  HWY_INLINE Vec<D> SwapAdjacentPairs(D, Vec<D>) const {
+    HWY_ASSERT(0);
+  }
+
+  // This is only called for 16 col networks (not supported).
+  template <class D>
+  HWY_INLINE Vec<D> SwapAdjacentQuads(D, Vec<D>) const {
+    HWY_ASSERT(0);
+  }
+
+  // This is only called for 8 col networks (not supported).
+  template <class D>
+  HWY_INLINE Vec<D> OddEvenQuads(D, Vec<D>, Vec<D>) const {
+    HWY_ASSERT(0);
+  }
+};
+
+// Base class shared between OrderAscending128, OrderDescending128.
+struct Key128 : public KeyAny128 {
+  // What type to pass to Sorter::operator().
+  using KeyType = hwy::uint128_t;
+
+  std::string KeyString() const { return "U128"; }
+
+  template <class D>
+  HWY_INLINE Mask<D> EqualKeys(D d, Vec<D> a, Vec<D> b) const {
+    return Eq128(d, a, b);
+  }
+
+  template <class D>
+  HWY_INLINE Mask<D> NotEqualKeys(D d, Vec<D> a, Vec<D> b) const {
+    return Ne128(d, a, b);
+  }
+
+  HWY_INLINE bool Equal1(const LaneType* a, const LaneType* b) {
+    return a[0] == b[0] && a[1] == b[1];
+  }
+};
+
+// Anything order-related depends on the key traits *and* the order (see
+// FirstOfLanes).
We cannot implement just one Compare function because Lt128 +// only compiles if the lane type is u64. Thus we need either overloaded +// functions with a tag type, class specializations, or separate classes. +// We avoid overloaded functions because we want all functions to be callable +// from a SortTraits without per-function wrappers. Specializing would work, but +// we are anyway going to specialize at a higher level. +struct OrderAscending128 : public Key128 { + using Order = SortAscending; + + HWY_INLINE bool Compare1(const LaneType* a, const LaneType* b) { + return (a[1] == b[1]) ? a[0] < b[0] : a[1] < b[1]; + } + + template + HWY_INLINE Mask Compare(D d, Vec a, Vec b) const { + return Lt128(d, a, b); + } + + // Used by CompareTop + template + HWY_INLINE Mask > CompareLanes(V a, V b) const { + return Lt(a, b); + } + + template + HWY_INLINE Vec First(D d, const Vec a, const Vec b) const { + return Min128(d, a, b); + } + + template + HWY_INLINE Vec Last(D d, const Vec a, const Vec b) const { + return Max128(d, a, b); + } + + // Same as for regular lanes because 128-bit lanes are u64. + template + HWY_INLINE Vec FirstValue(D d) const { + return Set(d, hwy::LowestValue >()); + } + + template + HWY_INLINE Vec LastValue(D d) const { + return Set(d, hwy::HighestValue >()); + } + + template + HWY_INLINE Vec PrevValue(D d, Vec v) const { + const Vec k0 = Zero(d); + const Vec k1 = OddEven(k0, Set(d, 1)); + const Mask borrow = Eq(v, k0); // don't-care, lo == 0 + // lo == 0? 1 : 0, 0 + const Vec adjust = ShiftLeftLanes<1>(IfThenElseZero(borrow, k1)); + return Sub(Sub(v, k1), adjust); + } +}; + +struct OrderDescending128 : public Key128 { + using Order = SortDescending; + + HWY_INLINE bool Compare1(const LaneType* a, const LaneType* b) { + return (a[1] == b[1]) ? b[0] < a[0] : b[1] < a[1]; + } + + template + HWY_INLINE Mask Compare(D d, Vec a, Vec b) const { + return Lt128(d, b, a); + } + + // Used by CompareTop + template + HWY_INLINE Mask > CompareLanes(V a, V b) const { + return Lt(b, a); + } + + template + HWY_INLINE Vec First(D d, const Vec a, const Vec b) const { + return Max128(d, a, b); + } + + template + HWY_INLINE Vec Last(D d, const Vec a, const Vec b) const { + return Min128(d, a, b); + } + + // Same as for regular lanes because 128-bit lanes are u64. + template + HWY_INLINE Vec FirstValue(D d) const { + return Set(d, hwy::HighestValue >()); + } + + template + HWY_INLINE Vec LastValue(D d) const { + return Set(d, hwy::LowestValue >()); + } + + template + HWY_INLINE Vec PrevValue(D d, Vec v) const { + const Vec k1 = OddEven(Zero(d), Set(d, 1)); + const Vec added = Add(v, k1); + const Mask overflowed = Lt(added, v); // false, overflowed + // overflowed? 1 : 0, 0 + const Vec adjust = ShiftLeftLanes<1>(IfThenElseZero(overflowed, k1)); + return Add(added, adjust); + } +}; + +// Base class shared between OrderAscendingKV128, OrderDescendingKV128. +struct KeyValue128 : public KeyAny128 { + // What type to pass to Sorter::operator(). 
+ using KeyType = K64V64; + + std::string KeyString() const { return "KV128"; } + + template + HWY_INLINE Mask EqualKeys(D d, Vec a, Vec b) const { + return Eq128Upper(d, a, b); + } + + template + HWY_INLINE Mask NotEqualKeys(D d, Vec a, Vec b) const { + return Ne128Upper(d, a, b); + } + + HWY_INLINE bool Equal1(const LaneType* a, const LaneType* b) { + return a[1] == b[1]; + } +}; + +struct OrderAscendingKV128 : public KeyValue128 { + using Order = SortAscending; + + HWY_INLINE bool Compare1(const LaneType* a, const LaneType* b) { + return a[1] < b[1]; + } + + template + HWY_INLINE Mask Compare(D d, Vec a, Vec b) const { + return Lt128Upper(d, a, b); + } + + // Used by CompareTop + template + HWY_INLINE Mask > CompareLanes(V a, V b) const { + return Lt(a, b); + } + + template + HWY_INLINE Vec First(D d, const Vec a, const Vec b) const { + return Min128Upper(d, a, b); + } + + template + HWY_INLINE Vec Last(D d, const Vec a, const Vec b) const { + return Max128Upper(d, a, b); + } + + // Same as for regular lanes because 128-bit lanes are u64. + template + HWY_INLINE Vec FirstValue(D d) const { + return Set(d, hwy::LowestValue >()); + } + + template + HWY_INLINE Vec LastValue(D d) const { + return Set(d, hwy::HighestValue >()); + } + + template + HWY_INLINE Vec PrevValue(D d, Vec v) const { + const Vec k1 = OddEven(Set(d, 1), Zero(d)); + return Sub(v, k1); + } +}; + +struct OrderDescendingKV128 : public KeyValue128 { + using Order = SortDescending; + + HWY_INLINE bool Compare1(const LaneType* a, const LaneType* b) { + return b[1] < a[1]; + } + + template + HWY_INLINE Mask Compare(D d, Vec a, Vec b) const { + return Lt128Upper(d, b, a); + } + + // Used by CompareTop + template + HWY_INLINE Mask > CompareLanes(V a, V b) const { + return Lt(b, a); + } + + template + HWY_INLINE Vec First(D d, const Vec a, const Vec b) const { + return Max128Upper(d, a, b); + } + + template + HWY_INLINE Vec Last(D d, const Vec a, const Vec b) const { + return Min128Upper(d, a, b); + } + + // Same as for regular lanes because 128-bit lanes are u64. + template + HWY_INLINE Vec FirstValue(D d) const { + return Set(d, hwy::HighestValue >()); + } + + template + HWY_INLINE Vec LastValue(D d) const { + return Set(d, hwy::LowestValue >()); + } + + template + HWY_INLINE Vec PrevValue(D d, Vec v) const { + const Vec k1 = OddEven(Set(d, 1), Zero(d)); + return Add(v, k1); + } +}; + +// Shared code that depends on Order. +template +class Traits128 : public Base { + // Special case for >= 256 bit vectors +#if HWY_TARGET <= HWY_AVX2 || HWY_TARGET == HWY_SVE_256 + // Returns vector with only the top u64 lane valid. Useful when the next step + // is to replicate the mask anyway. + template + HWY_INLINE HWY_MAYBE_UNUSED Vec CompareTop(D d, Vec a, Vec b) const { + const Base* base = static_cast(this); + const Mask eqHL = Eq(a, b); + const Vec ltHL = VecFromMask(d, base->CompareLanes(a, b)); +#if HWY_TARGET == HWY_SVE_256 + return IfThenElse(eqHL, DupEven(ltHL), ltHL); +#else + const Vec ltLX = ShiftLeftLanes<1>(ltHL); + return OrAnd(ltHL, VecFromMask(d, eqHL), ltLX); +#endif + } + + // We want to swap 2 u128, i.e. 4 u64 lanes, based on the 0 or FF..FF mask in + // the most-significant of those lanes (the result of CompareTop), so + // replicate it 4x. Only called for >= 256-bit vectors. 
+ template + HWY_INLINE V ReplicateTop4x(V v) const { +#if HWY_TARGET == HWY_SVE_256 + return svdup_lane_u64(v, 3); +#elif HWY_TARGET <= HWY_AVX3 + return V{_mm512_permutex_epi64(v.raw, _MM_SHUFFLE(3, 3, 3, 3))}; +#else // AVX2 + return V{_mm256_permute4x64_epi64(v.raw, _MM_SHUFFLE(3, 3, 3, 3))}; +#endif + } +#endif // HWY_TARGET + + public: + template + HWY_INLINE Vec FirstOfLanes(D d, Vec v, + TFromD* HWY_RESTRICT buf) const { + const Base* base = static_cast(this); + const size_t N = Lanes(d); + Store(v, d, buf); + v = base->SetKey(d, buf + 0); // result must be broadcasted + for (size_t i = base->LanesPerKey(); i < N; i += base->LanesPerKey()) { + v = base->First(d, v, base->SetKey(d, buf + i)); + } + return v; + } + + template + HWY_INLINE Vec LastOfLanes(D d, Vec v, + TFromD* HWY_RESTRICT buf) const { + const Base* base = static_cast(this); + const size_t N = Lanes(d); + Store(v, d, buf); + v = base->SetKey(d, buf + 0); // result must be broadcasted + for (size_t i = base->LanesPerKey(); i < N; i += base->LanesPerKey()) { + v = base->Last(d, v, base->SetKey(d, buf + i)); + } + return v; + } + + template + HWY_INLINE void Sort2(D d, Vec& a, Vec& b) const { + const Base* base = static_cast(this); + + const Vec a_copy = a; + const auto lt = base->Compare(d, a, b); + a = IfThenElse(lt, a, b); + b = IfThenElse(lt, b, a_copy); + } + + // Conditionally swaps even-numbered lanes with their odd-numbered neighbor. + template + HWY_INLINE Vec SortPairsDistance1(D d, Vec v) const { + const Base* base = static_cast(this); + Vec swapped = base->ReverseKeys2(d, v); + +#if HWY_TARGET <= HWY_AVX2 || HWY_TARGET == HWY_SVE_256 + const Vec select = ReplicateTop4x(CompareTop(d, v, swapped)); + return IfVecThenElse(select, swapped, v); +#else + Sort2(d, v, swapped); + return base->OddEvenKeys(swapped, v); +#endif + } + + // Swaps with the vector formed by reversing contiguous groups of 4 keys. + template + HWY_INLINE Vec SortPairsReverse4(D d, Vec v) const { + const Base* base = static_cast(this); + Vec swapped = base->ReverseKeys4(d, v); + + // Only specialize for AVX3 because this requires 512-bit vectors. +#if HWY_TARGET <= HWY_AVX3 + const Vec512 outHx = CompareTop(d, v, swapped); + // Similar to ReplicateTop4x, we want to gang together 2 comparison results + // (4 lanes). They are not contiguous, so use permute to replicate 4x. + alignas(64) uint64_t kIndices[8] = {7, 7, 5, 5, 5, 5, 7, 7}; + const Vec512 select = + TableLookupLanes(outHx, SetTableIndices(d, kIndices)); + return IfVecThenElse(select, swapped, v); +#else + Sort2(d, v, swapped); + return base->OddEvenPairs(d, swapped, v); +#endif + } + + // Conditionally swaps lane 0 with 4, 1 with 5 etc. + template + HWY_INLINE Vec SortPairsDistance4(D, Vec) const { + // Only used by Merge16, which would require 2048 bit vectors (unsupported). + HWY_ASSERT(0); + } +}; + +#endif // VQSORT_ENABLED + +} // namespace detail +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +#endif // HIGHWAY_HWY_CONTRIB_SORT_TRAITS128_TOGGLE diff --git a/hwy/contrib/sort/vqsort-inl.h b/hwy/contrib/sort/vqsort-inl.h new file mode 100644 index 0000000..10584d2 --- /dev/null +++ b/hwy/contrib/sort/vqsort-inl.h @@ -0,0 +1,1443 @@ +// Copyright 2021 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Normal include guard for target-independent parts
+#ifndef HIGHWAY_HWY_CONTRIB_SORT_VQSORT_INL_H_
+#define HIGHWAY_HWY_CONTRIB_SORT_VQSORT_INL_H_
+
+#ifndef VQSORT_PRINT
+#define VQSORT_PRINT 0
+#endif
+
+// Makes it harder for adversaries to predict our sampling locations, at the
+// cost of 1-2% increased runtime.
+#ifndef VQSORT_SECURE_RNG
+#define VQSORT_SECURE_RNG 0
+#endif
+
+#if VQSORT_SECURE_RNG
+#include "third_party/absl/random/random.h"
+#endif
+
+#include <stdio.h>   // unconditional #include so we can use if(VQSORT_PRINT).
+#include <string.h>  // memcpy
+
+#include "hwy/cache_control.h"        // Prefetch
+#include "hwy/contrib/sort/vqsort.h"  // Fill24Bytes
+
+#if HWY_IS_MSAN
+#include <sanitizer/msan_interface.h>
+#endif
+
+#endif  // HIGHWAY_HWY_CONTRIB_SORT_VQSORT_INL_H_
+
+// Per-target
+#if defined(HIGHWAY_HWY_CONTRIB_SORT_VQSORT_TOGGLE) == \
+    defined(HWY_TARGET_TOGGLE)
+#ifdef HIGHWAY_HWY_CONTRIB_SORT_VQSORT_TOGGLE
+#undef HIGHWAY_HWY_CONTRIB_SORT_VQSORT_TOGGLE
+#else
+#define HIGHWAY_HWY_CONTRIB_SORT_VQSORT_TOGGLE
+#endif
+
+#if VQSORT_PRINT
+#include "hwy/print-inl.h"
+#endif
+
+#include "hwy/contrib/sort/shared-inl.h"
+#include "hwy/contrib/sort/sorting_networks-inl.h"
+// Placeholder for internal instrumentation. Do not remove.
+#include "hwy/highway.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+namespace detail {
+
+using Constants = hwy::SortConstants;
+
+// Wrappers to avoid #if in user code (interferes with code folding)
+
+HWY_INLINE void UnpoisonIfMemorySanitizer(void* p, size_t bytes) {
+#if HWY_IS_MSAN
+  __msan_unpoison(p, bytes);
+#else
+  (void)p;
+  (void)bytes;
+#endif
+}
+
+template <class D>
+HWY_INLINE void MaybePrintVector(D d, const char* label, Vec<D> v,
+                                 size_t start = 0, size_t max_lanes = 16) {
+#if VQSORT_PRINT >= 2  // Print is only defined #if VQSORT_PRINT
+  Print(d, label, v, start, max_lanes);
+#else
+  (void)d;
+  (void)label;
+  (void)v;
+  (void)start;
+  (void)max_lanes;
+#endif
+}
+
+// ------------------------------ HeapSort
+
+template <typename Traits, typename T>
+void SiftDown(Traits st, T* HWY_RESTRICT lanes, const size_t num_lanes,
+              size_t start) {
+  constexpr size_t N1 = st.LanesPerKey();
+  const FixedTag<T, N1> d;
+
+  while (start < num_lanes) {
+    const size_t left = 2 * start + N1;
+    const size_t right = 2 * start + 2 * N1;
+    if (left >= num_lanes) break;
+    size_t idx_larger = start;
+    const auto key_j = st.SetKey(d, lanes + start);
+    if (AllTrue(d, st.Compare(d, key_j, st.SetKey(d, lanes + left)))) {
+      idx_larger = left;
+    }
+    if (right < num_lanes &&
+        AllTrue(d, st.Compare(d, st.SetKey(d, lanes + idx_larger),
+                              st.SetKey(d, lanes + right)))) {
+      idx_larger = right;
+    }
+    if (idx_larger == start) break;
+    st.Swap(lanes + start, lanes + idx_larger);
+    start = idx_larger;
+  }
+}
+
+// Heapsort: O(1) space, O(N*logN) worst-case comparisons.
+// Based on LLVM sanitizer_common.h, licensed under Apache-2.0.
+template <typename Traits, typename T>
+void HeapSort(Traits st, T* HWY_RESTRICT lanes, const size_t num_lanes) {
+  constexpr size_t N1 = st.LanesPerKey();
+
+  if (num_lanes < 2 * N1) return;
+
+  // Build heap.
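+  // (Bottom-up: sift down every internal node, starting from the last one.
+  // i is unsigned, so the loop ends when it wraps below zero; ~N1 + 1 is the
+  // two's-complement encoding of -N1.)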
+ for (size_t i = ((num_lanes - N1) / N1 / 2) * N1; i != (~N1 + 1); i -= N1) { + SiftDown(st, lanes, num_lanes, i); + } + + for (size_t i = num_lanes - N1; i != 0; i -= N1) { + // Swap root with last + st.Swap(lanes + 0, lanes + i); + + // Sift down the new root. + SiftDown(st, lanes, i, 0); + } +} + +#if VQSORT_ENABLED || HWY_IDE + +// ------------------------------ BaseCase + +// Sorts `keys` within the range [0, num) via sorting network. +template +HWY_NOINLINE void BaseCase(D d, Traits st, T* HWY_RESTRICT keys, + T* HWY_RESTRICT keys_end, size_t num, + T* HWY_RESTRICT buf) { + const size_t N = Lanes(d); + using V = decltype(Zero(d)); + + // _Nonzero32 requires num - 1 != 0. + if (HWY_UNLIKELY(num <= 1)) return; + + // Reshape into a matrix with kMaxRows rows, and columns limited by the + // 1D `num`, which is upper-bounded by the vector width (see BaseCaseNum). + const size_t num_pow2 = size_t{1} + << (32 - Num0BitsAboveMS1Bit_Nonzero32( + static_cast(num - 1))); + HWY_DASSERT(num <= num_pow2 && num_pow2 <= Constants::BaseCaseNum(N)); + const size_t cols = + HWY_MAX(st.LanesPerKey(), num_pow2 >> Constants::kMaxRowsLog2); + HWY_DASSERT(cols <= N); + + // We can avoid padding and load/store directly to `keys` after checking the + // original input array has enough space. Except at the right border, it's OK + // to sort more than the current sub-array. Even if we sort across a previous + // partition point, we know that keys will not migrate across it. However, we + // must use the maximum size of the sorting network, because the StoreU of its + // last vector would otherwise write invalid data starting at kMaxRows * cols. + const size_t N_sn = Lanes(CappedTag()); + if (HWY_LIKELY(keys + N_sn * Constants::kMaxRows <= keys_end)) { + SortingNetwork(st, keys, N_sn); + return; + } + + // Copy `keys` to `buf`. + size_t i; + for (i = 0; i + N <= num; i += N) { + Store(LoadU(d, keys + i), d, buf + i); + } + SafeCopyN(num - i, d, keys + i, buf + i); + i = num; + + // Fill with padding - last in sort order, not copied to keys. + const V kPadding = st.LastValue(d); + // Initialize an extra vector because SortingNetwork loads full vectors, + // which may exceed cols*kMaxRows. + for (; i < (cols * Constants::kMaxRows + N); i += N) { + StoreU(kPadding, d, buf + i); + } + + SortingNetwork(st, buf, cols); + + for (i = 0; i + N <= num; i += N) { + StoreU(Load(d, buf + i), d, keys + i); + } + SafeCopyN(num - i, d, buf + i, keys + i); +} + +// ------------------------------ Partition + +// Consumes from `keys` until a multiple of kUnroll*N remains. +// Temporarily stores the right side into `buf`, then moves behind `num`. +// Returns the number of keys consumed from the left side. +template +HWY_NOINLINE size_t PartitionToMultipleOfUnroll(D d, Traits st, + T* HWY_RESTRICT keys, + size_t& num, const Vec pivot, + T* HWY_RESTRICT buf) { + constexpr size_t kUnroll = Constants::kPartitionUnroll; + const size_t N = Lanes(d); + size_t readL = 0; + T* HWY_RESTRICT posL = keys; + size_t bufR = 0; + // Partition requires both a multiple of kUnroll*N and at least + // 2*kUnroll*N for the initial loads. If less, consume all here. + const size_t num_rem = + (num < 2 * kUnroll * N) ? num : (num & (kUnroll * N - 1)); + size_t i = 0; + for (; i + N <= num_rem; i += N) { + const Vec vL = LoadU(d, keys + readL); + readL += N; + + const auto comp = st.Compare(d, pivot, vL); + posL += CompressBlendedStore(vL, Not(comp), d, posL); + bufR += CompressStore(vL, comp, d, buf + bufR); + } + // Last iteration: only use valid lanes. 
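+  // (num_rem is generally not a multiple of N; FirstN masks off the lanes at
+  // and beyond num_rem - i so they are neither written left nor right.)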
+ if (HWY_LIKELY(i != num_rem)) { + const auto mask = FirstN(d, num_rem - i); + const Vec vL = LoadU(d, keys + readL); + + const auto comp = st.Compare(d, pivot, vL); + posL += CompressBlendedStore(vL, AndNot(comp, mask), d, posL); + bufR += CompressStore(vL, And(comp, mask), d, buf + bufR); + } + + // MSAN seems not to understand CompressStore. buf[0, bufR) are valid. + UnpoisonIfMemorySanitizer(buf, bufR * sizeof(T)); + + // Everything we loaded was put into buf, or behind the current `posL`, after + // which there is space for bufR items. First move items from `keys + num` to + // `posL` to free up space, then copy `buf` into the vacated `keys + num`. + // A loop with masked loads from `buf` is insufficient - we would also need to + // mask from `keys + num`. Combining a loop with memcpy for the remainders is + // slower than just memcpy, so we use that for simplicity. + num -= bufR; + memcpy(posL, keys + num, bufR * sizeof(T)); + memcpy(keys + num, buf, bufR * sizeof(T)); + return static_cast(posL - keys); // caller will shrink num by this. +} + +template +V OrXor(const V o, const V x1, const V x2) { + // TODO(janwas): add op so we can benefit from AVX-512 ternlog? + return Or(o, Xor(x1, x2)); +} + +// Note: we could track the OrXor of v and pivot to see if the entire left +// partition is equal, but that happens rarely and thus is a net loss. +template +HWY_INLINE void StoreLeftRight(D d, Traits st, const Vec v, + const Vec pivot, T* HWY_RESTRICT keys, + size_t& writeL, size_t& remaining) { + const size_t N = Lanes(d); + + const auto comp = st.Compare(d, pivot, v); + + remaining -= N; + if (hwy::HWY_NAMESPACE::CompressIsPartition::value || + (HWY_MAX_BYTES == 16 && st.Is128())) { + // Non-native Compress (e.g. AVX2): we are able to partition a vector using + // a single Compress+two StoreU instead of two Compress[Blended]Store. The + // latter are more expensive. Because we store entire vectors, the contents + // between the updated writeL and writeR are ignored and will be overwritten + // by subsequent calls. This works because writeL and writeR are at least + // two vectors apart. + const auto lr = st.CompressKeys(v, comp); + const size_t num_left = N - CountTrue(d, comp); + StoreU(lr, d, keys + writeL); + // Now write the right-side elements (if any), such that the previous writeR + // is one past the end of the newly written right elements, then advance. + StoreU(lr, d, keys + remaining + writeL); + writeL += num_left; + } else { + // Native Compress[Store] (e.g. AVX3), which only keep the left or right + // side, not both, hence we require two calls. + const size_t num_left = CompressStore(v, Not(comp), d, keys + writeL); + writeL += num_left; + + (void)CompressBlendedStore(v, comp, d, keys + remaining + writeL); + } +} + +template +HWY_INLINE void StoreLeftRight4(D d, Traits st, const Vec v0, + const Vec v1, const Vec v2, + const Vec v3, const Vec pivot, + T* HWY_RESTRICT keys, size_t& writeL, + size_t& remaining) { + StoreLeftRight(d, st, v0, pivot, keys, writeL, remaining); + StoreLeftRight(d, st, v1, pivot, keys, writeL, remaining); + StoreLeftRight(d, st, v2, pivot, keys, writeL, remaining); + StoreLeftRight(d, st, v3, pivot, keys, writeL, remaining); +} + +// Moves "<= pivot" keys to the front, and others to the back. pivot is +// broadcasted. Time-critical! +// +// Aligned loads do not seem to be worthwhile (not bottlenecked by load ports). 
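+//
+// As a reference point, a scalar sketch of the same contract (illustrative
+// only; `SketchPartition` is a hypothetical name): order within each side is
+// not preserved, only the boundary index matters.
+//
+//   template <typename T>
+//   size_t SketchPartition(T* keys, size_t num, T pivot) {
+//     size_t writeL = 0;
+//     for (size_t i = 0; i < num; ++i) {
+//       const T v = keys[i];
+//       if (!(pivot < v)) {  // v <= pivot: swap into the left region
+//         keys[i] = keys[writeL];
+//         keys[writeL] = v;
+//         ++writeL;
+//       }
+//     }
+//     return writeL;  // number of keys <= pivot
+//   }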
+template +HWY_NOINLINE size_t Partition(D d, Traits st, T* HWY_RESTRICT keys, size_t num, + const Vec pivot, T* HWY_RESTRICT buf) { + using V = decltype(Zero(d)); + const size_t N = Lanes(d); + + // StoreLeftRight will CompressBlendedStore ending at `writeR`. Unless all + // lanes happen to be in the right-side partition, this will overrun `keys`, + // which triggers asan errors. Avoid by special-casing the last vector. + HWY_DASSERT(num > 2 * N); // ensured by HandleSpecialCases + num -= N; + size_t last = num; + const V vlast = LoadU(d, keys + last); + + const size_t consumedL = + PartitionToMultipleOfUnroll(d, st, keys, num, pivot, buf); + keys += consumedL; + last -= consumedL; + num -= consumedL; + constexpr size_t kUnroll = Constants::kPartitionUnroll; + + // Partition splits the vector into 3 sections, left to right: Elements + // smaller or equal to the pivot, unpartitioned elements and elements larger + // than the pivot. To write elements unconditionally on the loop body without + // overwriting existing data, we maintain two regions of the loop where all + // elements have been copied elsewhere (e.g. vector registers.). I call these + // bufferL and bufferR, for left and right respectively. + // + // These regions are tracked by the indices (writeL, writeR, left, right) as + // presented in the diagram below. + // + // writeL writeR + // \/ \/ + // | <= pivot | bufferL | unpartitioned | bufferR | > pivot | + // \/ \/ + // left right + // + // In the main loop body below we choose a side, load some elements out of the + // vector and move either `left` or `right`. Next we call into StoreLeftRight + // to partition the data, and the partitioned elements will be written either + // to writeR or writeL and the corresponding index will be moved accordingly. + // + // Note that writeR is not explicitly tracked as an optimization for platforms + // with conditional operations. Instead we track writeL and the number of + // elements left to process (`remaining`). From the diagram above we can see + // that: + // writeR - writeL = remaining => writeR = remaining + writeL + // + // Tracking `remaining` is advantageous because each iteration reduces the + // number of unpartitioned elements by a fixed amount, so we can compute + // `remaining` without data dependencies. + // + size_t writeL = 0; + size_t remaining = num; + + const T* HWY_RESTRICT readL = keys; + const T* HWY_RESTRICT readR = keys + num; + // Cannot load if there were fewer than 2 * kUnroll * N. + if (HWY_LIKELY(num != 0)) { + HWY_DASSERT(num >= 2 * kUnroll * N); + HWY_DASSERT((num & (kUnroll * N - 1)) == 0); + + // Make space for writing in-place by reading from readL/readR. + const V vL0 = LoadU(d, readL + 0 * N); + const V vL1 = LoadU(d, readL + 1 * N); + const V vL2 = LoadU(d, readL + 2 * N); + const V vL3 = LoadU(d, readL + 3 * N); + readL += kUnroll * N; + readR -= kUnroll * N; + const V vR0 = LoadU(d, readR + 0 * N); + const V vR1 = LoadU(d, readR + 1 * N); + const V vR2 = LoadU(d, readR + 2 * N); + const V vR3 = LoadU(d, readR + 3 * N); + + // readL/readR changed above, so check again before the loop. + while (readL != readR) { + V v0, v1, v2, v3; + + // Data-dependent but branching is faster than forcing branch-free. + const size_t capacityL = + static_cast((readL - keys) - static_cast(writeL)); + HWY_DASSERT(capacityL <= num); // >= 0 + // Load data from the end of the vector with less data (front or back). + // The next paragraphs explain how this works. 
+ // + // let block_size = (kUnroll * N) + // On the loop prelude we load block_size elements from the front of the + // vector and an additional block_size elements from the back. On each + // iteration k elements are written to the front of the vector and + // (block_size - k) to the back. + // + // This creates a loop invariant where the capacity on the front + // (capacityL) and on the back (capacityR) always add to 2 * block_size. + // In other words: + // capacityL + capacityR = 2 * block_size + // capacityR = 2 * block_size - capacityL + // + // This means that: + // capacityL < capacityR <=> + // capacityL < 2 * block_size - capacityL <=> + // 2 * capacityL < 2 * block_size <=> + // capacityL < block_size + // + // Thus the check on the next line is equivalent to capacityL > capacityR. + // + if (kUnroll * N < capacityL) { + readR -= kUnroll * N; + v0 = LoadU(d, readR + 0 * N); + v1 = LoadU(d, readR + 1 * N); + v2 = LoadU(d, readR + 2 * N); + v3 = LoadU(d, readR + 3 * N); + hwy::Prefetch(readR - 3 * kUnroll * N); + } else { + v0 = LoadU(d, readL + 0 * N); + v1 = LoadU(d, readL + 1 * N); + v2 = LoadU(d, readL + 2 * N); + v3 = LoadU(d, readL + 3 * N); + readL += kUnroll * N; + hwy::Prefetch(readL + 3 * kUnroll * N); + } + + StoreLeftRight4(d, st, v0, v1, v2, v3, pivot, keys, writeL, remaining); + } + + // Now finish writing the saved vectors to the middle. + StoreLeftRight4(d, st, vL0, vL1, vL2, vL3, pivot, keys, writeL, remaining); + StoreLeftRight4(d, st, vR0, vR1, vR2, vR3, pivot, keys, writeL, remaining); + } + + // We have partitioned [left, right) such that writeL is the boundary. + HWY_DASSERT(remaining == 0); + // Make space for inserting vlast: move up to N of the first right-side keys + // into the unused space starting at last. If we have fewer, ensure they are + // the last items in that vector by subtracting from the *load* address, + // which is safe because we have at least two vectors (checked above). + const size_t totalR = last - writeL; + const size_t startR = totalR < N ? writeL + totalR - N : writeL; + StoreU(LoadU(d, keys + startR), d, keys + last); + + // Partition vlast: write L, then R, into the single-vector gap at writeL. + const auto comp = st.Compare(d, pivot, vlast); + writeL += CompressBlendedStore(vlast, Not(comp), d, keys + writeL); + (void)CompressBlendedStore(vlast, comp, d, keys + writeL); + + return consumedL + writeL; +} + +// Returns true and partitions if [keys, keys + num) contains only {valueL, +// valueR}. Otherwise, sets third to the first differing value; keys may have +// been reordered and a regular Partition is still necessary. +template +HWY_NOINLINE bool MaybePartitionTwoValue(D d, Traits st, T* HWY_RESTRICT keys, + size_t num, const Vec valueL, + const Vec valueR, Vec& third, + T* HWY_RESTRICT buf) { + const size_t N = Lanes(d); + + size_t i = 0; + size_t writeL = 0; + + // As long as all lanes are equal to L or R, we can overwrite with valueL. + // This is faster than first counting, then backtracking to fill L and R. + for (; i + N <= num; i += N) { + const Vec v = LoadU(d, keys + i); + // It is not clear how to apply OrXor here - that can check if *both* + // comparisons are true, but here we want *either*. Comparing the unsigned + // min of differences to zero works, but is expensive for u64 prior to AVX3. + const Mask eqL = st.EqualKeys(d, v, valueL); + const Mask eqR = st.EqualKeys(d, v, valueR); + // At least one other value present; will require a regular partition. 
+ // On AVX-512, Or + AllTrue are folded into a single kortest if we are + // careful with the FindKnownFirstTrue argument, see below. + if (HWY_UNLIKELY(!AllTrue(d, Or(eqL, eqR)))) { + // If we repeat Or(eqL, eqR) here, the compiler will hoist it into the + // loop, which is a pessimization because this if-true branch is cold. + // We can defeat this via Not(Xor), which is equivalent because eqL and + // eqR cannot be true at the same time. Can we elide the additional Not? + // FindFirstFalse instructions are generally unavailable, but we can + // fuse Not and Xor/Or into one ExclusiveNeither. + const size_t lane = FindKnownFirstTrue(d, ExclusiveNeither(eqL, eqR)); + third = st.SetKey(d, keys + i + lane); + if (VQSORT_PRINT >= 2) { + fprintf(stderr, "found 3rd value at vec %zu; writeL %zu\n", i, writeL); + } + // 'Undo' what we did by filling the remainder of what we read with R. + for (; writeL + N <= i; writeL += N) { + StoreU(valueR, d, keys + writeL); + } + BlendedStore(valueR, FirstN(d, i - writeL), d, keys + writeL); + return false; + } + StoreU(valueL, d, keys + writeL); + writeL += CountTrue(d, eqL); + } + + // Final vector, masked comparison (no effect if i == num) + const size_t remaining = num - i; + SafeCopyN(remaining, d, keys + i, buf); + const Vec v = Load(d, buf); + const Mask valid = FirstN(d, remaining); + const Mask eqL = And(st.EqualKeys(d, v, valueL), valid); + const Mask eqR = st.EqualKeys(d, v, valueR); + // Invalid lanes are considered equal. + const Mask eq = Or(Or(eqL, eqR), Not(valid)); + // At least one other value present; will require a regular partition. + if (HWY_UNLIKELY(!AllTrue(d, eq))) { + const size_t lane = FindKnownFirstTrue(d, Not(eq)); + third = st.SetKey(d, keys + i + lane); + if (VQSORT_PRINT >= 2) { + fprintf(stderr, "found 3rd value at partial vec %zu; writeL %zu\n", i, + writeL); + } + // 'Undo' what we did by filling the remainder of what we read with R. + for (; writeL + N <= i; writeL += N) { + StoreU(valueR, d, keys + writeL); + } + BlendedStore(valueR, FirstN(d, i - writeL), d, keys + writeL); + return false; + } + BlendedStore(valueL, valid, d, keys + writeL); + writeL += CountTrue(d, eqL); + + // Fill right side + i = writeL; + for (; i + N <= num; i += N) { + StoreU(valueR, d, keys + i); + } + BlendedStore(valueR, FirstN(d, num - i), d, keys + i); + + if (VQSORT_PRINT >= 2) { + fprintf(stderr, "Successful MaybePartitionTwoValue\n"); + } + return true; +} + +// Same as above, except that the pivot equals valueR, so scan right to left. +template +HWY_NOINLINE bool MaybePartitionTwoValueR(D d, Traits st, T* HWY_RESTRICT keys, + size_t num, const Vec valueL, + const Vec valueR, Vec& third, + T* HWY_RESTRICT buf) { + const size_t N = Lanes(d); + + HWY_DASSERT(num >= N); + size_t pos = num - N; // current read/write position + size_t countR = 0; // number of valueR found + + // For whole vectors, in descending address order: as long as all lanes are + // equal to L or R, overwrite with valueR. This is faster than counting, then + // filling both L and R. Loop terminates after unsigned wraparound. + for (; pos < num; pos -= N) { + const Vec v = LoadU(d, keys + pos); + // It is not clear how to apply OrXor here - that can check if *both* + // comparisons are true, but here we want *either*. Comparing the unsigned + // min of differences to zero works, but is expensive for u64 prior to AVX3. 
+ const Mask eqL = st.EqualKeys(d, v, valueL); + const Mask eqR = st.EqualKeys(d, v, valueR); + // If there is a third value, stop and undo what we've done. On AVX-512, + // Or + AllTrue are folded into a single kortest, but only if we are + // careful with the FindKnownFirstTrue argument - see prior comment on that. + if (HWY_UNLIKELY(!AllTrue(d, Or(eqL, eqR)))) { + const size_t lane = FindKnownFirstTrue(d, ExclusiveNeither(eqL, eqR)); + third = st.SetKey(d, keys + pos + lane); + if (VQSORT_PRINT >= 2) { + fprintf(stderr, "found 3rd value at vec %zu; countR %zu\n", pos, + countR); + MaybePrintVector(d, "third", third, 0, st.LanesPerKey()); + } + pos += N; // rewind: we haven't yet committed changes in this iteration. + // We have filled [pos, num) with R, but only countR of them should have + // been written. Rewrite [pos, num - countR) to L. + HWY_DASSERT(countR <= num - pos); + const size_t endL = num - countR; + for (; pos + N <= endL; pos += N) { + StoreU(valueL, d, keys + pos); + } + BlendedStore(valueL, FirstN(d, endL - pos), d, keys + pos); + return false; + } + StoreU(valueR, d, keys + pos); + countR += CountTrue(d, eqR); + } + + // Final partial (or empty) vector, masked comparison. + const size_t remaining = pos + N; + HWY_DASSERT(remaining <= N); + const Vec v = LoadU(d, keys); // Safe because num >= N. + const Mask valid = FirstN(d, remaining); + const Mask eqL = st.EqualKeys(d, v, valueL); + const Mask eqR = And(st.EqualKeys(d, v, valueR), valid); + // Invalid lanes are considered equal. + const Mask eq = Or(Or(eqL, eqR), Not(valid)); + // At least one other value present; will require a regular partition. + if (HWY_UNLIKELY(!AllTrue(d, eq))) { + const size_t lane = FindKnownFirstTrue(d, Not(eq)); + third = st.SetKey(d, keys + lane); + if (VQSORT_PRINT >= 2) { + fprintf(stderr, "found 3rd value at partial vec %zu; writeR %zu\n", pos, + countR); + MaybePrintVector(d, "third", third, 0, st.LanesPerKey()); + } + pos += N; // rewind: we haven't yet committed changes in this iteration. + // We have filled [pos, num) with R, but only countR of them should have + // been written. Rewrite [pos, num - countR) to L. + HWY_DASSERT(countR <= num - pos); + const size_t endL = num - countR; + for (; pos + N <= endL; pos += N) { + StoreU(valueL, d, keys + pos); + } + BlendedStore(valueL, FirstN(d, endL - pos), d, keys + pos); + return false; + } + const size_t lastR = CountTrue(d, eqR); + countR += lastR; + + // First finish writing valueR - [0, N) lanes were not yet written. + StoreU(valueR, d, keys); // Safe because num >= N. + + // Fill left side (ascending order for clarity) + const size_t endL = num - countR; + size_t i = 0; + for (; i + N <= endL; i += N) { + StoreU(valueL, d, keys + i); + } + Store(valueL, d, buf); + SafeCopyN(endL - i, d, buf, keys + i); // avoids asan overrun + + if (VQSORT_PRINT >= 2) { + fprintf(stderr, + "MaybePartitionTwoValueR countR %zu pos %zu i %zu endL %zu\n", + countR, pos, i, endL); + } + + return true; +} + +// `idx_second` is `first_mismatch` from `AllEqual` and thus the index of the +// second key. This is the first path into `MaybePartitionTwoValue`, called +// when all samples are equal. Returns false if there are at least a third +// value and sets `third`. Otherwise, partitions the array and returns true. +template +HWY_INLINE bool PartitionIfTwoKeys(D d, Traits st, const Vec pivot, + T* HWY_RESTRICT keys, size_t num, + const size_t idx_second, const Vec second, + Vec& third, T* HWY_RESTRICT buf) { + // True if second comes before pivot. 
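+  // (st.Compare(d, pivot, second) holds when pivot precedes second; if it
+  // holds in no lane, then second precedes pivot, because the two keys are
+  // known to differ.)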
+ const bool is_pivotR = AllFalse(d, st.Compare(d, pivot, second)); + if (VQSORT_PRINT >= 1) { + fprintf(stderr, "Samples all equal, diff at %zu, isPivotR %d\n", idx_second, + is_pivotR); + } + HWY_DASSERT(AllFalse(d, st.EqualKeys(d, second, pivot))); + + // If pivot is R, we scan backwards over the entire array. Otherwise, + // we already scanned up to idx_second and can leave those in place. + return is_pivotR ? MaybePartitionTwoValueR(d, st, keys, num, second, pivot, + third, buf) + : MaybePartitionTwoValue(d, st, keys + idx_second, + num - idx_second, pivot, second, + third, buf); +} + +// Second path into `MaybePartitionTwoValue`, called when not all samples are +// equal. `samples` is sorted. +template +HWY_INLINE bool PartitionIfTwoSamples(D d, Traits st, T* HWY_RESTRICT keys, + size_t num, T* HWY_RESTRICT samples) { + constexpr size_t kSampleLanes = 3 * 64 / sizeof(T); + constexpr size_t N1 = st.LanesPerKey(); + const Vec valueL = st.SetKey(d, samples); + const Vec valueR = st.SetKey(d, samples + kSampleLanes - N1); + HWY_DASSERT(AllTrue(d, st.Compare(d, valueL, valueR))); + HWY_DASSERT(AllFalse(d, st.EqualKeys(d, valueL, valueR))); + const Vec prev = st.PrevValue(d, valueR); + // If the sample has more than two values, then the keys have at least that + // many, and thus this special case is inapplicable. + if (HWY_UNLIKELY(!AllTrue(d, st.EqualKeys(d, valueL, prev)))) { + return false; + } + + // Must not overwrite samples because if this returns false, caller wants to + // read the original samples again. + T* HWY_RESTRICT buf = samples + kSampleLanes; + Vec third; // unused + return MaybePartitionTwoValue(d, st, keys, num, valueL, valueR, third, buf); +} + +// ------------------------------ Pivot sampling + +template +HWY_INLINE V MedianOf3(Traits st, V v0, V v1, V v2) { + const DFromV d; + // Slightly faster for 128-bit, apparently because not serially dependent. + if (st.Is128()) { + // Median = XOR-sum 'minus' the first and last. Calling First twice is + // slightly faster than Compare + 2 IfThenElse or even IfThenElse + XOR. + const auto sum = Xor(Xor(v0, v1), v2); + const auto first = st.First(d, st.First(d, v0, v1), v2); + const auto last = st.Last(d, st.Last(d, v0, v1), v2); + return Xor(Xor(sum, first), last); + } + st.Sort2(d, v0, v2); + v1 = st.Last(d, v0, v1); + v1 = st.First(d, v1, v2); + return v1; +} + +#if VQSORT_SECURE_RNG +using Generator = absl::BitGen; +#else +// Based on https://github.com/numpy/numpy/issues/16313#issuecomment-641897028 +#pragma pack(push, 1) +class Generator { + public: + Generator(const void* heap, size_t num) { + Sorter::Fill24Bytes(heap, num, &a_); + k_ = 1; // stream index: must be odd + } + + explicit Generator(uint64_t seed) { + a_ = b_ = w_ = seed; + k_ = 1; + } + + uint64_t operator()() { + const uint64_t b = b_; + w_ += k_; + const uint64_t next = a_ ^ w_; + a_ = (b + (b << 3)) ^ (b >> 11); + const uint64_t rot = (b << 24) | (b >> 40); + b_ = rot + next; + return next; + } + + private: + uint64_t a_; + uint64_t b_; + uint64_t w_; + uint64_t k_; // increment +}; +#pragma pack(pop) + +#endif // !VQSORT_SECURE_RNG + +// Returns slightly biased random index of a chunk in [0, num_chunks). +// See https://www.pcg-random.org/posts/bounded-rands.html. +HWY_INLINE size_t RandomChunkIndex(const uint32_t num_chunks, uint32_t bits) { + const uint64_t chunk_index = (static_cast(bits) * num_chunks) >> 32; + HWY_DASSERT(chunk_index < num_chunks); + return static_cast(chunk_index); +} + +// Writes samples from `keys[0, num)` into `buf`. 
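+// Sketch of the sampling scheme below, in scalar form (lpc = lanes_per_chunk;
+// the vectorized loop computes the medians lane-wise, N at a time):
+//   for (size_t c = 0; c < 3; ++c) {  // three chunks of output medians
+//     // o0, o1, o2 = random chunk-aligned offsets from RandomChunkIndex
+//     for (size_t i = 0; i < lpc; ++i) {
+//       buf[c * lpc + i] = median of keys[o0+i], keys[o1+i], keys[o2+i];
+//     }
+//   }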
+template +HWY_INLINE void DrawSamples(D d, Traits st, T* HWY_RESTRICT keys, size_t num, + T* HWY_RESTRICT buf, Generator& rng) { + using V = decltype(Zero(d)); + const size_t N = Lanes(d); + + // Power of two + const size_t lanes_per_chunk = Constants::LanesPerChunk(sizeof(T), N); + + // Align start of keys to chunks. We always have at least 2 chunks because the + // base case would have handled anything up to 16 vectors, i.e. >= 4 chunks. + HWY_DASSERT(num >= 2 * lanes_per_chunk); + const size_t misalign = + (reinterpret_cast(keys) / sizeof(T)) & (lanes_per_chunk - 1); + if (misalign != 0) { + const size_t consume = lanes_per_chunk - misalign; + keys += consume; + num -= consume; + } + + // Generate enough random bits for 9 uint32 + uint64_t* bits64 = reinterpret_cast(buf); + for (size_t i = 0; i < 5; ++i) { + bits64[i] = rng(); + } + const uint32_t* bits = reinterpret_cast(buf); + + const uint32_t lpc32 = static_cast(lanes_per_chunk); + // Avoid division + const size_t log2_lpc = Num0BitsBelowLS1Bit_Nonzero32(lpc32); + const size_t num_chunks64 = num >> log2_lpc; + // Clamp to uint32 for RandomChunkIndex + const uint32_t num_chunks = + static_cast(HWY_MIN(num_chunks64, 0xFFFFFFFFull)); + + const size_t offset0 = RandomChunkIndex(num_chunks, bits[0]) << log2_lpc; + const size_t offset1 = RandomChunkIndex(num_chunks, bits[1]) << log2_lpc; + const size_t offset2 = RandomChunkIndex(num_chunks, bits[2]) << log2_lpc; + const size_t offset3 = RandomChunkIndex(num_chunks, bits[3]) << log2_lpc; + const size_t offset4 = RandomChunkIndex(num_chunks, bits[4]) << log2_lpc; + const size_t offset5 = RandomChunkIndex(num_chunks, bits[5]) << log2_lpc; + const size_t offset6 = RandomChunkIndex(num_chunks, bits[6]) << log2_lpc; + const size_t offset7 = RandomChunkIndex(num_chunks, bits[7]) << log2_lpc; + const size_t offset8 = RandomChunkIndex(num_chunks, bits[8]) << log2_lpc; + for (size_t i = 0; i < lanes_per_chunk; i += N) { + const V v0 = Load(d, keys + offset0 + i); + const V v1 = Load(d, keys + offset1 + i); + const V v2 = Load(d, keys + offset2 + i); + const V medians0 = MedianOf3(st, v0, v1, v2); + Store(medians0, d, buf + i); + + const V v3 = Load(d, keys + offset3 + i); + const V v4 = Load(d, keys + offset4 + i); + const V v5 = Load(d, keys + offset5 + i); + const V medians1 = MedianOf3(st, v3, v4, v5); + Store(medians1, d, buf + i + lanes_per_chunk); + + const V v6 = Load(d, keys + offset6 + i); + const V v7 = Load(d, keys + offset7 + i); + const V v8 = Load(d, keys + offset8 + i); + const V medians2 = MedianOf3(st, v6, v7, v8); + Store(medians2, d, buf + i + lanes_per_chunk * 2); + } +} + +// For detecting inputs where (almost) all keys are equal. +template +HWY_INLINE bool UnsortedSampleEqual(D d, Traits st, + const TFromD* HWY_RESTRICT samples) { + constexpr size_t kSampleLanes = 3 * 64 / sizeof(TFromD); + const size_t N = Lanes(d); + using V = Vec; + + const V first = st.SetKey(d, samples); + // OR of XOR-difference may be faster than comparison. + V diff = Zero(d); + size_t i = 0; + for (; i + N <= kSampleLanes; i += N) { + const V v = Load(d, samples + i); + diff = OrXor(diff, first, v); + } + // Remainder, if any. 
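+  // OrXor(diff, first, v) computes Or(diff, Xor(first, v)): diff remains
+  // all-zero iff every sampled lane equals `first`. The masked select below
+  // handles kSampleLanes not being a multiple of N.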
+ const V v = Load(d, samples + i); + const auto valid = FirstN(d, kSampleLanes - i); + diff = IfThenElse(valid, OrXor(diff, first, v), diff); + + // Must avoid floating-point comparisons (for -0) + const RebindToUnsigned du; + return AllTrue(du, Eq(BitCast(du, diff), Zero(du))); +} + +template +HWY_INLINE void SortSamples(D d, Traits st, T* HWY_RESTRICT buf) { + // buf contains 192 bytes, so 16 128-bit vectors are necessary and sufficient. + constexpr size_t kSampleLanes = 3 * 64 / sizeof(T); + const CappedTag d128; + const size_t N128 = Lanes(d128); + constexpr size_t kCols = HWY_MIN(16 / sizeof(T), Constants::kMaxCols); + constexpr size_t kBytes = kCols * Constants::kMaxRows * sizeof(T); + static_assert(192 <= kBytes, ""); + // Fill with padding - last in sort order. + const auto kPadding = st.LastValue(d128); + // Initialize an extra vector because SortingNetwork loads full vectors, + // which may exceed cols*kMaxRows. + for (size_t i = kSampleLanes; i <= kBytes / sizeof(T); i += N128) { + StoreU(kPadding, d128, buf + i); + } + + SortingNetwork(st, buf, kCols); + + if (VQSORT_PRINT >= 2) { + const size_t N = Lanes(d); + fprintf(stderr, "Samples:\n"); + for (size_t i = 0; i < kSampleLanes; i += N) { + MaybePrintVector(d, "", Load(d, buf + i), 0, N); + } + } +} + +// ------------------------------ Pivot selection + +enum class PivotResult { + kDone, // stop without partitioning (all equal, or two-value partition) + kNormal, // partition and recurse left and right + kIsFirst, // partition but skip left recursion + kWasLast, // partition but skip right recursion +}; + +HWY_INLINE const char* PivotResultString(PivotResult result) { + switch (result) { + case PivotResult::kDone: + return "done"; + case PivotResult::kNormal: + return "normal"; + case PivotResult::kIsFirst: + return "first"; + case PivotResult::kWasLast: + return "last"; + } + return "unknown"; +} + +template +HWY_INLINE size_t PivotRank(Traits st, const T* HWY_RESTRICT samples) { + constexpr size_t kSampleLanes = 3 * 64 / sizeof(T); + constexpr size_t N1 = st.LanesPerKey(); + + constexpr size_t kRankMid = kSampleLanes / 2; + static_assert(kRankMid % N1 == 0, "Mid is not an aligned key"); + + // Find the previous value not equal to the median. + size_t rank_prev = kRankMid - N1; + for (; st.Equal1(samples + rank_prev, samples + kRankMid); rank_prev -= N1) { + // All previous samples are equal to the median. + if (rank_prev == 0) return 0; + } + + size_t rank_next = rank_prev + N1; + for (; st.Equal1(samples + rank_next, samples + kRankMid); rank_next += N1) { + // The median is also the largest sample. If it is also the largest key, + // we'd end up with an empty right partition, so choose the previous key. + if (rank_next == kSampleLanes - N1) return rank_prev; + } + + // If we choose the median as pivot, the ratio of keys ending in the left + // partition will likely be rank_next/kSampleLanes (if the sample is + // representative). This is because equal-to-pivot values also land in the + // left - it's infeasible to do an in-place vectorized 3-way partition. + // Check whether prev would lead to a more balanced partition. + const size_t excess_if_median = rank_next - kRankMid; + const size_t excess_if_prev = kRankMid - rank_prev; + return excess_if_median < excess_if_prev ? kRankMid : rank_prev; +} + +// Returns pivot chosen from `samples`. It will never be the largest key +// (thus the right partition will never be empty). 
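+// Worked example for PivotRank (used below), scaled down to nine sorted
+// single-lane samples [1,1,1,2,2,2,3,3,3]: kRankMid = 4 (median 2),
+// rank_prev = 2, rank_next = 6; excess_if_median == excess_if_prev == 2, so
+// the tie goes to rank_prev and the chosen pivot is 1.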
+template +HWY_INLINE Vec ChoosePivotByRank(D d, Traits st, + const T* HWY_RESTRICT samples) { + const size_t pivot_rank = PivotRank(st, samples); + const Vec pivot = st.SetKey(d, samples + pivot_rank); + if (VQSORT_PRINT >= 2) { + fprintf(stderr, " Pivot rank %zu = %f\n", pivot_rank, + static_cast(GetLane(pivot))); + } + return pivot; +} + +// Returns true if all keys equal `pivot`, otherwise returns false and sets +// `*first_mismatch' to the index of the first differing key. +template +HWY_NOINLINE bool AllEqual(D d, Traits st, const Vec pivot, + const T* HWY_RESTRICT keys, size_t num, + size_t* HWY_RESTRICT first_mismatch) { + const size_t N = Lanes(d); + // Ensures we can use overlapping loads for the tail; see HandleSpecialCases. + HWY_DASSERT(num >= N); + const Vec zero = Zero(d); + + // Vector-align keys + i. + const size_t misalign = + (reinterpret_cast(keys) / sizeof(T)) & (N - 1); + HWY_DASSERT(misalign % st.LanesPerKey() == 0); + const size_t consume = N - misalign; + { + const Vec v = LoadU(d, keys); + // Only check masked lanes; consider others to be equal. + const Mask diff = And(FirstN(d, consume), st.NotEqualKeys(d, v, pivot)); + if (HWY_UNLIKELY(!AllFalse(d, diff))) { + const size_t lane = FindKnownFirstTrue(d, diff); + *first_mismatch = lane; + return false; + } + } + size_t i = consume; + HWY_DASSERT(((reinterpret_cast(keys + i) / sizeof(T)) & (N - 1)) == + 0); + + // Sticky bits registering any difference between `keys` and the first key. + // We use vector XOR because it may be cheaper than comparisons, especially + // for 128-bit. 2x unrolled for more ILP. + Vec diff0 = zero; + Vec diff1 = zero; + + // We want to stop once a difference has been found, but without slowing + // down the loop by comparing during each iteration. The compromise is to + // compare after a 'group', which consists of kLoops times two vectors. + constexpr size_t kLoops = 8; + const size_t lanes_per_group = kLoops * 2 * N; + + for (; i + lanes_per_group <= num; i += lanes_per_group) { + HWY_DEFAULT_UNROLL + for (size_t loop = 0; loop < kLoops; ++loop) { + const Vec v0 = Load(d, keys + i + loop * 2 * N); + const Vec v1 = Load(d, keys + i + loop * 2 * N + N); + diff0 = OrXor(diff0, v0, pivot); + diff1 = OrXor(diff1, v1, pivot); + } + diff0 = Or(diff0, diff1); + + // If there was a difference in the entire group: (use du because we must + // avoid floating-point comparisons for -0) + const RebindToUnsigned du; + if (HWY_UNLIKELY(!AllTrue(du, Eq(BitCast(du, diff0), Zero(du))))) { + // .. then loop until the first one, with termination guarantee. + for (;; i += N) { + const Vec v = Load(d, keys + i); + const Mask diff = st.NotEqualKeys(d, v, pivot); + if (HWY_UNLIKELY(!AllFalse(d, diff))) { + const size_t lane = FindKnownFirstTrue(d, diff); + *first_mismatch = i + lane; + return false; + } + } + } + } + + // Whole vectors, no unrolling, compare directly + for (; i + N <= num; i += N) { + const Vec v = Load(d, keys + i); + const Mask diff = st.NotEqualKeys(d, v, pivot); + if (HWY_UNLIKELY(!AllFalse(d, diff))) { + const size_t lane = FindKnownFirstTrue(d, diff); + *first_mismatch = i + lane; + return false; + } + } + // Always re-check the last (unaligned) vector to reduce branching. 
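+  // This final load overlaps the aligned loop above whenever num is not a
+  // multiple of N; lanes checked twice are harmless because we only look for
+  // any mismatch.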
+ i = num - N; + const Vec v = LoadU(d, keys + i); + const Mask diff = st.NotEqualKeys(d, v, pivot); + if (HWY_UNLIKELY(!AllFalse(d, diff))) { + const size_t lane = FindKnownFirstTrue(d, diff); + *first_mismatch = i + lane; + return false; + } + + if (VQSORT_PRINT >= 1) { + fprintf(stderr, "All keys equal\n"); + } + return true; // all equal +} + +template +HWY_NOINLINE bool ExistsAnyBefore(D d, Traits st, const T* HWY_RESTRICT keys, + size_t num, const Vec pivot) { + const size_t N = Lanes(d); + HWY_DASSERT(num >= N); // See HandleSpecialCases + + if (VQSORT_PRINT >= 2) { + fprintf(stderr, "Scanning for before\n"); + } + + size_t i = 0; + + constexpr size_t kLoops = 16; + const size_t lanes_per_group = kLoops * N; + + Vec first = pivot; + + // Whole group, unrolled + for (; i + lanes_per_group <= num; i += lanes_per_group) { + HWY_DEFAULT_UNROLL + for (size_t loop = 0; loop < kLoops; ++loop) { + const Vec curr = LoadU(d, keys + i + loop * N); + first = st.First(d, first, curr); + } + + if (HWY_UNLIKELY(!AllFalse(d, st.Compare(d, first, pivot)))) { + if (VQSORT_PRINT >= 2) { + fprintf(stderr, "Stopped scanning at end of group %zu\n", + i + lanes_per_group); + } + return true; + } + } + // Whole vectors, no unrolling + for (; i + N <= num; i += N) { + const Vec curr = LoadU(d, keys + i); + if (HWY_UNLIKELY(!AllFalse(d, st.Compare(d, curr, pivot)))) { + if (VQSORT_PRINT >= 2) { + fprintf(stderr, "Stopped scanning at %zu\n", i); + } + return true; + } + } + // If there are remainders, re-check the last whole vector. + if (HWY_LIKELY(i != num)) { + const Vec curr = LoadU(d, keys + num - N); + if (HWY_UNLIKELY(!AllFalse(d, st.Compare(d, curr, pivot)))) { + if (VQSORT_PRINT >= 2) { + fprintf(stderr, "Stopped scanning at last %zu\n", num - N); + } + return true; + } + } + + return false; // pivot is the first +} + +template +HWY_NOINLINE bool ExistsAnyAfter(D d, Traits st, const T* HWY_RESTRICT keys, + size_t num, const Vec pivot) { + const size_t N = Lanes(d); + HWY_DASSERT(num >= N); // See HandleSpecialCases + + if (VQSORT_PRINT >= 2) { + fprintf(stderr, "Scanning for after\n"); + } + + size_t i = 0; + + constexpr size_t kLoops = 16; + const size_t lanes_per_group = kLoops * N; + + Vec last = pivot; + + // Whole group, unrolled + for (; i + lanes_per_group <= num; i += lanes_per_group) { + HWY_DEFAULT_UNROLL + for (size_t loop = 0; loop < kLoops; ++loop) { + const Vec curr = LoadU(d, keys + i + loop * N); + last = st.Last(d, last, curr); + } + + if (HWY_UNLIKELY(!AllFalse(d, st.Compare(d, pivot, last)))) { + if (VQSORT_PRINT >= 2) { + fprintf(stderr, "Stopped scanning at end of group %zu\n", + i + lanes_per_group); + } + return true; + } + } + // Whole vectors, no unrolling + for (; i + N <= num; i += N) { + const Vec curr = LoadU(d, keys + i); + if (HWY_UNLIKELY(!AllFalse(d, st.Compare(d, pivot, curr)))) { + if (VQSORT_PRINT >= 2) { + fprintf(stderr, "Stopped scanning at %zu\n", i); + } + return true; + } + } + // If there are remainders, re-check the last whole vector. + if (HWY_LIKELY(i != num)) { + const Vec curr = LoadU(d, keys + num - N); + if (HWY_UNLIKELY(!AllFalse(d, st.Compare(d, pivot, curr)))) { + if (VQSORT_PRINT >= 2) { + fprintf(stderr, "Stopped scanning at last %zu\n", num - N); + } + return true; + } + } + + return false; // pivot is the last +} + +// Returns pivot chosen from `keys[0, num)`. It will never be the largest key +// (thus the right partition will never be empty). 
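+// Decision sketch for the function below (p = pivot; s, t = second and third
+// after Sort2, so s precedes t):
+//   p == FirstValue                      -> kIsFirst (early out)
+//   p == LastValue                       -> kWasLast, use PrevValue(p)
+//   s < p and (p < t or ExistsAnyAfter)  -> kNormal
+//   s < p and nothing after p            -> kWasLast, use PrevValue(p)
+//   p < t and ExistsAnyBefore            -> kNormal
+//   p < t and nothing before p           -> kIsFirst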
+template +HWY_INLINE Vec ChoosePivotForEqualSamples(D d, Traits st, + T* HWY_RESTRICT keys, size_t num, + T* HWY_RESTRICT samples, + Vec second, Vec third, + PivotResult& result) { + const Vec pivot = st.SetKey(d, samples); // the single unique sample + + // Early out for mostly-0 arrays, where pivot is often FirstValue. + if (HWY_UNLIKELY(AllTrue(d, st.EqualKeys(d, pivot, st.FirstValue(d))))) { + result = PivotResult::kIsFirst; + return pivot; + } + if (HWY_UNLIKELY(AllTrue(d, st.EqualKeys(d, pivot, st.LastValue(d))))) { + result = PivotResult::kWasLast; + return st.PrevValue(d, pivot); + } + + // Check if pivot is between two known values. If so, it is not the first nor + // the last and we can avoid scanning. + st.Sort2(d, second, third); + HWY_DASSERT(AllTrue(d, st.Compare(d, second, third))); + const bool before = !AllFalse(d, st.Compare(d, second, pivot)); + const bool after = !AllFalse(d, st.Compare(d, pivot, third)); + // Only reached if there are three keys, which means pivot is either first, + // last, or in between. Thus there is another key that comes before or after. + HWY_DASSERT(before || after); + if (HWY_UNLIKELY(before)) { + // Neither first nor last. + if (HWY_UNLIKELY(after || ExistsAnyAfter(d, st, keys, num, pivot))) { + result = PivotResult::kNormal; + return pivot; + } + + // We didn't find anything after pivot, so it is the last. Because keys + // equal to the pivot go to the left partition, the right partition would be + // empty and Partition will not have changed anything. Instead use the + // previous value in sort order, which is not necessarily an actual key. + result = PivotResult::kWasLast; + return st.PrevValue(d, pivot); + } + + // Has after, and we found one before: in the middle. + if (HWY_UNLIKELY(ExistsAnyBefore(d, st, keys, num, pivot))) { + result = PivotResult::kNormal; + return pivot; + } + + // Pivot is first. We could consider a special partition mode that only + // reads from and writes to the right side, and later fills in the left + // side, which we know is equal to the pivot. However, that leads to more + // cache misses if the array is large, and doesn't save much, hence is a + // net loss. + result = PivotResult::kIsFirst; + return pivot; +} + +// ------------------------------ Quicksort recursion + +template +HWY_NOINLINE void PrintMinMax(D d, Traits st, const T* HWY_RESTRICT keys, + size_t num, T* HWY_RESTRICT buf) { + if (VQSORT_PRINT >= 2) { + const size_t N = Lanes(d); + if (num < N) return; + + Vec first = st.LastValue(d); + Vec last = st.FirstValue(d); + + size_t i = 0; + for (; i + N <= num; i += N) { + const Vec v = LoadU(d, keys + i); + first = st.First(d, v, first); + last = st.Last(d, v, last); + } + if (HWY_LIKELY(i != num)) { + HWY_DASSERT(num >= N); // See HandleSpecialCases + const Vec v = LoadU(d, keys + num - N); + first = st.First(d, v, first); + last = st.Last(d, v, last); + } + + first = st.FirstOfLanes(d, first, buf); + last = st.LastOfLanes(d, last, buf); + MaybePrintVector(d, "first", first, 0, st.LanesPerKey()); + MaybePrintVector(d, "last", last, 0, st.LanesPerKey()); + } +} + +// keys_end is the end of the entire user input, not just the current subarray +// [keys, keys + num). 
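+// Outline of one level of Recurse:
+//   num <= BaseCaseNum      -> sorting-network base case, done.
+//   all samples equal       -> done if AllEqual or PartitionIfTwoKeys
+//                              succeeds, else ChoosePivotForEqualSamples.
+//   samples differ          -> SortSamples; done if PartitionIfTwoSamples
+//                              succeeds, else ChoosePivotByRank.
+//   remaining_levels == 0   -> HeapSort (guards against deep recursion).
+//   otherwise               -> Partition, then recurse into each side unless
+//                              PivotResult says it holds only equal keys.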
+template +HWY_NOINLINE void Recurse(D d, Traits st, T* HWY_RESTRICT keys, + T* HWY_RESTRICT keys_end, const size_t num, + T* HWY_RESTRICT buf, Generator& rng, + size_t remaining_levels) { + HWY_DASSERT(num != 0); + + if (HWY_UNLIKELY(num <= Constants::BaseCaseNum(Lanes(d)))) { + BaseCase(d, st, keys, keys_end, num, buf); + return; + } + + // Move after BaseCase so we skip printing for small subarrays. + if (VQSORT_PRINT >= 1) { + fprintf(stderr, "\n\n=== Recurse depth=%zu len=%zu\n", remaining_levels, + num); + PrintMinMax(d, st, keys, num, buf); + } + + DrawSamples(d, st, keys, num, buf, rng); + + Vec pivot; + PivotResult result = PivotResult::kNormal; + if (HWY_UNLIKELY(UnsortedSampleEqual(d, st, buf))) { + pivot = st.SetKey(d, buf); + size_t idx_second = 0; + if (HWY_UNLIKELY(AllEqual(d, st, pivot, keys, num, &idx_second))) { + return; + } + HWY_DASSERT(idx_second % st.LanesPerKey() == 0); + // Must capture the value before PartitionIfTwoKeys may overwrite it. + const Vec second = st.SetKey(d, keys + idx_second); + MaybePrintVector(d, "pivot", pivot, 0, st.LanesPerKey()); + MaybePrintVector(d, "second", second, 0, st.LanesPerKey()); + + Vec third; + if (HWY_UNLIKELY(PartitionIfTwoKeys(d, st, pivot, keys, num, idx_second, + second, third, buf))) { + return; // Done, skip recursion because each side has all-equal keys. + } + + // We can no longer start scanning from idx_second because + // PartitionIfTwoKeys may have reordered keys. + pivot = ChoosePivotForEqualSamples(d, st, keys, num, buf, second, third, + result); + // If kNormal, `pivot` is very common but not the first/last. It is + // tempting to do a 3-way partition (to avoid moving the =pivot keys a + // second time), but that is a net loss due to the extra comparisons. + } else { + SortSamples(d, st, buf); + + if (HWY_UNLIKELY(PartitionIfTwoSamples(d, st, keys, num, buf))) { + return; + } + + pivot = ChoosePivotByRank(d, st, buf); + } + + // Too many recursions. This is unlikely to happen because we select pivots + // from large (though still O(1)) samples. + if (HWY_UNLIKELY(remaining_levels == 0)) { + if (VQSORT_PRINT >= 1) { + fprintf(stderr, "HeapSort reached, size=%zu\n", num); + } + HeapSort(st, keys, num); // Slow but N*logN. + return; + } + + const size_t bound = Partition(d, st, keys, num, pivot, buf); + if (VQSORT_PRINT >= 2) { + fprintf(stderr, "bound %zu num %zu result %s\n", bound, num, + PivotResultString(result)); + } + if (HWY_LIKELY(result != PivotResult::kIsFirst)) { + // The left partition is not empty because the pivot is one of the keys. + HWY_DASSERT(0 != bound && bound != num); + Recurse(d, st, keys, keys_end, bound, buf, rng, remaining_levels - 1); + } + if (HWY_LIKELY(result != PivotResult::kWasLast)) { + // ChoosePivot* ensure pivot != last, so the right partition is never empty. + HWY_DASSERT(bound != num); + Recurse(d, st, keys + bound, keys_end, num - bound, buf, rng, + remaining_levels - 1); + } +} + +// Returns true if sorting is finished. +template +HWY_INLINE bool HandleSpecialCases(D d, Traits st, T* HWY_RESTRICT keys, + size_t num) { + const size_t N = Lanes(d); + const size_t base_case_num = Constants::BaseCaseNum(N); + + // 128-bit keys require vectors with at least two u64 lanes, which is always + // the case unless `d` requests partial vectors (e.g. fraction = 1/2) AND the + // hardware vector width is less than 128bit / fraction. + const bool partial_128 = !IsFull(d) && N < 2 && st.Is128(); + // Partition assumes its input is at least two vectors. 
If vectors are huge,
+  // base_case_num may actually be smaller. If so, which is only possible on
+  // RVV, pass a capped or partial d (LMUL < 1). Use HWY_MAX_BYTES instead of
+  // HWY_LANES to account for the largest possible LMUL.
+  constexpr bool kPotentiallyHuge =
+      HWY_MAX_BYTES / sizeof(T) > Constants::kMaxRows * Constants::kMaxCols;
+  const bool huge_vec = kPotentiallyHuge && (2 * N > base_case_num);
+  if (partial_128 || huge_vec) {
+    if (VQSORT_PRINT >= 1) {
+      fprintf(stderr, "WARNING: using slow HeapSort: partial %d huge %d\n",
+              partial_128, huge_vec);
+    }
+    HeapSort(st, keys, num);
+    return true;
+  }
+
+  // Small arrays are already handled by Recurse.
+
+  // We could also check for already sorted/reverse/equal, but that's probably
+  // counterproductive if vqsort is used as a base case.
+
+  return false;  // not finished sorting
+}
+
+#endif  // VQSORT_ENABLED
+}  // namespace detail
+
+// Sorts `keys[0..num-1]` according to the order defined by `st.Compare`.
+// In-place i.e. O(1) additional storage. Worst-case N*logN comparisons.
+// Non-stable (order of equal keys may change), except for the common case
+// where the upper bits of T are the key, and the lower bits are a sequential
+// or at least unique ID.
+// There is no upper limit on `num`, but note that pivots may be chosen by
+// sampling only from the first 256 GiB.
+//
+// `d` is typically SortTag<T> (chooses between full and partial vectors).
+// `st` is SharedTraits<Traits*<Order*>>. This abstraction layer bridges
+// differences in sort order and single-lane vs 128-bit keys.
+template <class D, class Traits, typename T>
+void Sort(D d, Traits st, T* HWY_RESTRICT keys, size_t num,
+          T* HWY_RESTRICT buf) {
+  if (VQSORT_PRINT >= 1) {
+    fprintf(stderr, "=============== Sort num %zu\n", num);
+  }
+
+#if VQSORT_ENABLED || HWY_IDE
+#if !HWY_HAVE_SCALABLE
+  // On targets with fixed-size vectors, avoid _using_ the allocated memory.
+  // We avoid (potentially expensive for small input sizes) allocations on
+  // platforms where no targets are scalable. For 512-bit vectors, this fits
+  // on the stack (several KiB).
+  HWY_ALIGN T storage[SortConstants::BufNum<T>(HWY_LANES(T))] = {};
+  static_assert(sizeof(storage) <= 8192, "Unexpectedly large, check size");
+  buf = storage;
+#endif  // !HWY_HAVE_SCALABLE
+
+  if (detail::HandleSpecialCases(d, st, keys, num)) return;
+
+#if HWY_MAX_BYTES > 64
+  // sorting_networks-inl and traits assume no more than 512 bit vectors.
+  if (HWY_UNLIKELY(Lanes(d) > 64 / sizeof(T))) {
+    return Sort(CappedTag<T, 64 / sizeof(T)>(), st, keys, num, buf);
+  }
+#endif  // HWY_MAX_BYTES > 64
+
+  detail::Generator rng(keys, num);
+
+  // Introspection: switch to worst-case N*logN heapsort after this many.
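+  // For example, num = 2^20 allows 2 * 20 + 4 = 44 levels, more than double
+  // the perfectly balanced depth of 20, so the HeapSort fallback in Recurse
+  // only triggers for inputs that repeatedly defeat the pivot sampling.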
+ const size_t max_levels = 2 * hwy::CeilLog2(num) + 4; + detail::Recurse(d, st, keys, keys + num, num, buf, rng, max_levels); +#else + (void)d; + (void)buf; + if (VQSORT_PRINT >= 1) { + fprintf(stderr, "WARNING: using slow HeapSort because vqsort disabled\n"); + } + return detail::HeapSort(st, keys, num); +#endif // VQSORT_ENABLED +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +#endif // HIGHWAY_HWY_CONTRIB_SORT_VQSORT_TOGGLE diff --git a/hwy/contrib/sort/vqsort.cc b/hwy/contrib/sort/vqsort.cc new file mode 100644 index 0000000..b3bac07 --- /dev/null +++ b/hwy/contrib/sort/vqsort.cc @@ -0,0 +1,184 @@ +// Copyright 2021 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "hwy/contrib/sort/vqsort.h" + +#include // memset + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort.cc" +#include "hwy/foreach_target.h" // IWYU pragma: keep + +// After foreach_target +#include "hwy/contrib/sort/shared-inl.h" + +// Architectures for which we know HWY_HAVE_SCALABLE == 0. This opts into an +// optimization that replaces dynamic allocation with stack storage. +#ifndef VQSORT_STACK +#if HWY_ARCH_X86 || HWY_ARCH_WASM +#define VQSORT_STACK 1 +#else +#define VQSORT_STACK 0 +#endif +#endif // VQSORT_STACK + +#if !VQSORT_STACK +#include "hwy/aligned_allocator.h" +#endif + +// Check if we have sys/random.h. First skip some systems on which the check +// itself (features.h) might be problematic. +#if defined(ANDROID) || defined(__ANDROID__) || HWY_ARCH_RVV +#define VQSORT_GETRANDOM 0 +#endif + +#if !defined(VQSORT_GETRANDOM) && HWY_OS_LINUX +#include + +// ---- which libc +#if defined(__UCLIBC__) +#define VQSORT_GETRANDOM 1 // added Mar 2015, before uclibc-ng 1.0 + +#elif defined(__GLIBC__) && defined(__GLIBC_PREREQ) +#if __GLIBC_PREREQ(2, 25) +#define VQSORT_GETRANDOM 1 +#else +#define VQSORT_GETRANDOM 0 +#endif + +#else +// Assume MUSL, which has getrandom since 2018. There is no macro to test, see +// https://www.openwall.com/lists/musl/2013/03/29/13. +#define VQSORT_GETRANDOM 1 + +#endif // ---- which libc +#endif // linux + +#if !defined(VQSORT_GETRANDOM) +#define VQSORT_GETRANDOM 0 +#endif + +// Seed source for SFC generator: 1=getrandom, 2=CryptGenRandom +// (not all Android support the getrandom wrapper) +#ifndef VQSORT_SECURE_SEED + +#if VQSORT_GETRANDOM +#define VQSORT_SECURE_SEED 1 +#elif defined(_WIN32) || defined(_WIN64) +#define VQSORT_SECURE_SEED 2 +#else +#define VQSORT_SECURE_SEED 0 +#endif + +#endif // VQSORT_SECURE_SEED + +#if !VQSORT_SECURE_RNG + +#include +#if VQSORT_SECURE_SEED == 1 +#include +#elif VQSORT_SECURE_SEED == 2 +#include +#pragma comment(lib, "advapi32.lib") +// Must come after windows.h. 
+#include +#endif // VQSORT_SECURE_SEED + +#endif // !VQSORT_SECURE_RNG + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { + +size_t VectorSize() { return Lanes(ScalableTag()); } +bool HaveFloat64() { return HWY_HAVE_FLOAT64; } + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace hwy { +namespace { +HWY_EXPORT(VectorSize); +HWY_EXPORT(HaveFloat64); + +} // namespace + +Sorter::Sorter() { +#if VQSORT_STACK + ptr_ = nullptr; // Sort will use stack storage instead +#else + // Determine the largest buffer size required for any type by trying them all. + // (The capping of N in BaseCaseNum means that smaller N but larger sizeof_t + // may require a larger buffer.) + const size_t vector_size = HWY_DYNAMIC_DISPATCH(VectorSize)(); + const size_t max_bytes = + HWY_MAX(HWY_MAX(SortConstants::BufBytes(vector_size), + SortConstants::BufBytes(vector_size)), + SortConstants::BufBytes(vector_size)); + ptr_ = hwy::AllocateAlignedBytes(max_bytes, nullptr, nullptr); + + // Prevent msan errors by initializing. + memset(ptr_, 0, max_bytes); +#endif +} + +void Sorter::Delete() { +#if !VQSORT_STACK + FreeAlignedBytes(ptr_, nullptr, nullptr); + ptr_ = nullptr; +#endif +} + +#if !VQSORT_SECURE_RNG + +void Sorter::Fill24Bytes(const void* seed_heap, size_t seed_num, void* bytes) { +#if VQSORT_SECURE_SEED == 1 + // May block if urandom is not yet initialized. + const ssize_t ret = getrandom(bytes, 24, /*flags=*/0); + if (ret == 24) return; +#elif VQSORT_SECURE_SEED == 2 + HCRYPTPROV hProvider{}; + if (CryptAcquireContextA(&hProvider, nullptr, nullptr, PROV_RSA_FULL, + CRYPT_VERIFYCONTEXT)) { + const BOOL ok = + CryptGenRandom(hProvider, 24, reinterpret_cast(bytes)); + CryptReleaseContext(hProvider, 0); + if (ok) return; + } +#endif + + // VQSORT_SECURE_SEED == 0, or one of the above failed. Get some entropy from + // stack/heap/code addresses and the clock() timer. + uint64_t* words = reinterpret_cast(bytes); + uint64_t** seed_stack = &words; + void (*seed_code)(const void*, size_t, void*) = &Fill24Bytes; + const uintptr_t bits_stack = reinterpret_cast(seed_stack); + const uintptr_t bits_heap = reinterpret_cast(seed_heap); + const uintptr_t bits_code = reinterpret_cast(seed_code); + const uint64_t bits_time = static_cast(clock()); + words[0] = bits_stack ^ bits_time ^ seed_num; + words[1] = bits_heap ^ bits_time ^ seed_num; + words[2] = bits_code ^ bits_time ^ seed_num; +} + +#endif // !VQSORT_SECURE_RNG + +bool Sorter::HaveFloat64() { return HWY_DYNAMIC_DISPATCH(HaveFloat64)(); } + +} // namespace hwy +#endif // HWY_ONCE diff --git a/hwy/contrib/sort/vqsort.h b/hwy/contrib/sort/vqsort.h new file mode 100644 index 0000000..88d78ac --- /dev/null +++ b/hwy/contrib/sort/vqsort.h @@ -0,0 +1,108 @@ +// Copyright 2022 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Interface to vectorized quicksort with dynamic dispatch. 
+// Blog post: https://tinyurl.com/vqsort-blog +// Paper with measurements: https://arxiv.org/abs/2205.05982 +// +// To ensure the overhead of using wide vectors (e.g. AVX2 or AVX-512) is +// worthwhile, we recommend using this code for sorting arrays whose size is at +// least 512 KiB. + +#ifndef HIGHWAY_HWY_CONTRIB_SORT_VQSORT_H_ +#define HIGHWAY_HWY_CONTRIB_SORT_VQSORT_H_ + +#include "hwy/base.h" + +namespace hwy { + +// Tag arguments that determine the sort order. +struct SortAscending { + constexpr bool IsAscending() const { return true; } +}; +struct SortDescending { + constexpr bool IsAscending() const { return false; } +}; + +// Allocates O(1) space. Type-erased RAII wrapper over hwy/aligned_allocator.h. +// This allows amortizing the allocation over multiple sorts. +class HWY_CONTRIB_DLLEXPORT Sorter { + public: + Sorter(); + ~Sorter() { Delete(); } + + // Move-only + Sorter(const Sorter&) = delete; + Sorter& operator=(const Sorter&) = delete; + Sorter(Sorter&& other) { + Delete(); + ptr_ = other.ptr_; + other.ptr_ = nullptr; + } + Sorter& operator=(Sorter&& other) { + Delete(); + ptr_ = other.ptr_; + other.ptr_ = nullptr; + return *this; + } + + // Sorts keys[0, n). Dispatches to the best available instruction set, + // and does not allocate memory. + void operator()(uint16_t* HWY_RESTRICT keys, size_t n, SortAscending) const; + void operator()(uint16_t* HWY_RESTRICT keys, size_t n, SortDescending) const; + void operator()(uint32_t* HWY_RESTRICT keys, size_t n, SortAscending) const; + void operator()(uint32_t* HWY_RESTRICT keys, size_t n, SortDescending) const; + void operator()(uint64_t* HWY_RESTRICT keys, size_t n, SortAscending) const; + void operator()(uint64_t* HWY_RESTRICT keys, size_t n, SortDescending) const; + + void operator()(int16_t* HWY_RESTRICT keys, size_t n, SortAscending) const; + void operator()(int16_t* HWY_RESTRICT keys, size_t n, SortDescending) const; + void operator()(int32_t* HWY_RESTRICT keys, size_t n, SortAscending) const; + void operator()(int32_t* HWY_RESTRICT keys, size_t n, SortDescending) const; + void operator()(int64_t* HWY_RESTRICT keys, size_t n, SortAscending) const; + void operator()(int64_t* HWY_RESTRICT keys, size_t n, SortDescending) const; + + void operator()(float* HWY_RESTRICT keys, size_t n, SortAscending) const; + void operator()(float* HWY_RESTRICT keys, size_t n, SortDescending) const; + void operator()(double* HWY_RESTRICT keys, size_t n, SortAscending) const; + void operator()(double* HWY_RESTRICT keys, size_t n, SortDescending) const; + + void operator()(uint128_t* HWY_RESTRICT keys, size_t n, SortAscending) const; + void operator()(uint128_t* HWY_RESTRICT keys, size_t n, SortDescending) const; + + void operator()(K64V64* HWY_RESTRICT keys, size_t n, SortAscending) const; + void operator()(K64V64* HWY_RESTRICT keys, size_t n, SortDescending) const; + + void operator()(K32V32* HWY_RESTRICT keys, size_t n, SortAscending) const; + void operator()(K32V32* HWY_RESTRICT keys, size_t n, SortDescending) const; + + // For internal use only + static void Fill24Bytes(const void* seed_heap, size_t seed_num, void* bytes); + static bool HaveFloat64(); + + private: + void Delete(); + + template + T* Get() const { + return static_cast(ptr_); + } + + void* ptr_ = nullptr; +}; + +} // namespace hwy + +#endif // HIGHWAY_HWY_CONTRIB_SORT_VQSORT_H_ diff --git a/hwy/contrib/sort/vqsort_128a.cc b/hwy/contrib/sort/vqsort_128a.cc new file mode 100644 index 0000000..40daea8 --- /dev/null +++ b/hwy/contrib/sort/vqsort_128a.cc @@ -0,0 +1,62 @@ +// Copyright 
2021 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "hwy/contrib/sort/vqsort.h" + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_128a.cc" +#include "hwy/foreach_target.h" // IWYU pragma: keep + +// After foreach_target +#include "hwy/contrib/sort/traits128-inl.h" +#include "hwy/contrib/sort/vqsort-inl.h" + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { + +void Sort128Asc(uint64_t* HWY_RESTRICT keys, size_t num, + uint64_t* HWY_RESTRICT buf) { +#if VQSORT_ENABLED + SortTag d; + detail::SharedTraits> st; + Sort(d, st, keys, num, buf); +#else + (void) keys; + (void) num; + (void) buf; + HWY_ASSERT(0); +#endif +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace hwy { +namespace { +HWY_EXPORT(Sort128Asc); +} // namespace + +void Sorter::operator()(uint128_t* HWY_RESTRICT keys, size_t n, + SortAscending) const { + HWY_DYNAMIC_DISPATCH(Sort128Asc) + (reinterpret_cast(keys), n * 2, Get()); +} + +} // namespace hwy +#endif // HWY_ONCE diff --git a/hwy/contrib/sort/vqsort_128d.cc b/hwy/contrib/sort/vqsort_128d.cc new file mode 100644 index 0000000..357da84 --- /dev/null +++ b/hwy/contrib/sort/vqsort_128d.cc @@ -0,0 +1,62 @@ +// Copyright 2021 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
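+// Descending-order counterpart of vqsort_128a.cc. As there, each uint128_t
+// key occupies two adjacent uint64_t lanes, which is why the dispatch wrapper
+// below passes n * 2 lanes.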
+ +#include "hwy/contrib/sort/vqsort.h" + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_128d.cc" +#include "hwy/foreach_target.h" // IWYU pragma: keep + +// After foreach_target +#include "hwy/contrib/sort/traits128-inl.h" +#include "hwy/contrib/sort/vqsort-inl.h" + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { + +void Sort128Desc(uint64_t* HWY_RESTRICT keys, size_t num, + uint64_t* HWY_RESTRICT buf) { +#if VQSORT_ENABLED + SortTag d; + detail::SharedTraits> st; + Sort(d, st, keys, num, buf); +#else + (void) keys; + (void) num; + (void) buf; + HWY_ASSERT(0); +#endif +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace hwy { +namespace { +HWY_EXPORT(Sort128Desc); +} // namespace + +void Sorter::operator()(uint128_t* HWY_RESTRICT keys, size_t n, + SortDescending) const { + HWY_DYNAMIC_DISPATCH(Sort128Desc) + (reinterpret_cast(keys), n * 2, Get()); +} + +} // namespace hwy +#endif // HWY_ONCE diff --git a/hwy/contrib/sort/vqsort_f32a.cc b/hwy/contrib/sort/vqsort_f32a.cc new file mode 100644 index 0000000..3856eea --- /dev/null +++ b/hwy/contrib/sort/vqsort_f32a.cc @@ -0,0 +1,53 @@ +// Copyright 2021 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "hwy/contrib/sort/vqsort.h" + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_f32a.cc" +#include "hwy/foreach_target.h" // IWYU pragma: keep + +// After foreach_target +#include "hwy/contrib/sort/traits-inl.h" +#include "hwy/contrib/sort/vqsort-inl.h" + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { + +void SortF32Asc(float* HWY_RESTRICT keys, size_t num, float* HWY_RESTRICT buf) { + SortTag d; + detail::SharedTraits>> st; + Sort(d, st, keys, num, buf); +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace hwy { +namespace { +HWY_EXPORT(SortF32Asc); +} // namespace + +void Sorter::operator()(float* HWY_RESTRICT keys, size_t n, + SortAscending) const { + HWY_DYNAMIC_DISPATCH(SortF32Asc)(keys, n, Get()); +} + +} // namespace hwy +#endif // HWY_ONCE diff --git a/hwy/contrib/sort/vqsort_f32d.cc b/hwy/contrib/sort/vqsort_f32d.cc new file mode 100644 index 0000000..7f5f97c --- /dev/null +++ b/hwy/contrib/sort/vqsort_f32d.cc @@ -0,0 +1,54 @@ +// Copyright 2021 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "hwy/contrib/sort/vqsort.h" + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_f32d.cc" +#include "hwy/foreach_target.h" // IWYU pragma: keep + +// After foreach_target +#include "hwy/contrib/sort/traits-inl.h" +#include "hwy/contrib/sort/vqsort-inl.h" + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { + +void SortF32Desc(float* HWY_RESTRICT keys, size_t num, + float* HWY_RESTRICT buf) { + SortTag d; + detail::SharedTraits>> st; + Sort(d, st, keys, num, buf); +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace hwy { +namespace { +HWY_EXPORT(SortF32Desc); +} // namespace + +void Sorter::operator()(float* HWY_RESTRICT keys, size_t n, + SortDescending) const { + HWY_DYNAMIC_DISPATCH(SortF32Desc)(keys, n, Get()); +} + +} // namespace hwy +#endif // HWY_ONCE diff --git a/hwy/contrib/sort/vqsort_f64a.cc b/hwy/contrib/sort/vqsort_f64a.cc new file mode 100644 index 0000000..287d521 --- /dev/null +++ b/hwy/contrib/sort/vqsort_f64a.cc @@ -0,0 +1,61 @@ +// Copyright 2021 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "hwy/contrib/sort/vqsort.h" + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_f64a.cc" +#include "hwy/foreach_target.h" // IWYU pragma: keep + +// After foreach_target +#include "hwy/contrib/sort/traits-inl.h" +#include "hwy/contrib/sort/vqsort-inl.h" + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { + +void SortF64Asc(double* HWY_RESTRICT keys, size_t num, + double* HWY_RESTRICT buf) { +#if HWY_HAVE_FLOAT64 + SortTag d; + detail::SharedTraits>> st; + Sort(d, st, keys, num, buf); +#else + (void)keys; + (void)num; + (void)buf; + HWY_ASSERT(0); +#endif +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace hwy { +namespace { +HWY_EXPORT(SortF64Asc); +} // namespace + +void Sorter::operator()(double* HWY_RESTRICT keys, size_t n, + SortAscending) const { + HWY_DYNAMIC_DISPATCH(SortF64Asc)(keys, n, Get()); +} + +} // namespace hwy +#endif // HWY_ONCE diff --git a/hwy/contrib/sort/vqsort_f64d.cc b/hwy/contrib/sort/vqsort_f64d.cc new file mode 100644 index 0000000..74d40c1 --- /dev/null +++ b/hwy/contrib/sort/vqsort_f64d.cc @@ -0,0 +1,61 @@ +// Copyright 2021 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "hwy/contrib/sort/vqsort.h" + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_f64d.cc" +#include "hwy/foreach_target.h" // IWYU pragma: keep + +// After foreach_target +#include "hwy/contrib/sort/traits-inl.h" +#include "hwy/contrib/sort/vqsort-inl.h" + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { + +void SortF64Desc(double* HWY_RESTRICT keys, size_t num, + double* HWY_RESTRICT buf) { +#if HWY_HAVE_FLOAT64 + SortTag d; + detail::SharedTraits>> st; + Sort(d, st, keys, num, buf); +#else + (void)keys; + (void)num; + (void)buf; + HWY_ASSERT(0); +#endif +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace hwy { +namespace { +HWY_EXPORT(SortF64Desc); +} // namespace + +void Sorter::operator()(double* HWY_RESTRICT keys, size_t n, + SortDescending) const { + HWY_DYNAMIC_DISPATCH(SortF64Desc)(keys, n, Get()); +} + +} // namespace hwy +#endif // HWY_ONCE diff --git a/hwy/contrib/sort/vqsort_i16a.cc b/hwy/contrib/sort/vqsort_i16a.cc new file mode 100644 index 0000000..ef4bb75 --- /dev/null +++ b/hwy/contrib/sort/vqsort_i16a.cc @@ -0,0 +1,54 @@ +// Copyright 2021 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
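+// Like the other per-type vqsort_* translation units: foreach_target.h
+// recompiles this file once per target, HWY_EXPORT records each target's
+// SortI16Asc, and Sorter::operator() selects the best one at run time via
+// HWY_DYNAMIC_DISPATCH.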
+ +#include "hwy/contrib/sort/vqsort.h" + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_i16a.cc" +#include "hwy/foreach_target.h" // IWYU pragma: keep + +// After foreach_target +#include "hwy/contrib/sort/traits-inl.h" +#include "hwy/contrib/sort/vqsort-inl.h" + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { + +void SortI16Asc(int16_t* HWY_RESTRICT keys, size_t num, + int16_t* HWY_RESTRICT buf) { + SortTag d; + detail::SharedTraits>> st; + Sort(d, st, keys, num, buf); +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace hwy { +namespace { +HWY_EXPORT(SortI16Asc); +} // namespace + +void Sorter::operator()(int16_t* HWY_RESTRICT keys, size_t n, + SortAscending) const { + HWY_DYNAMIC_DISPATCH(SortI16Asc)(keys, n, Get()); +} + +} // namespace hwy +#endif // HWY_ONCE diff --git a/hwy/contrib/sort/vqsort_i16d.cc b/hwy/contrib/sort/vqsort_i16d.cc new file mode 100644 index 0000000..6507ed6 --- /dev/null +++ b/hwy/contrib/sort/vqsort_i16d.cc @@ -0,0 +1,54 @@ +// Copyright 2021 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "hwy/contrib/sort/vqsort.h" + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_i16d.cc" +#include "hwy/foreach_target.h" // IWYU pragma: keep + +// After foreach_target +#include "hwy/contrib/sort/traits-inl.h" +#include "hwy/contrib/sort/vqsort-inl.h" + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { + +void SortI16Desc(int16_t* HWY_RESTRICT keys, size_t num, + int16_t* HWY_RESTRICT buf) { + SortTag d; + detail::SharedTraits>> st; + Sort(d, st, keys, num, buf); +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace hwy { +namespace { +HWY_EXPORT(SortI16Desc); +} // namespace + +void Sorter::operator()(int16_t* HWY_RESTRICT keys, size_t n, + SortDescending) const { + HWY_DYNAMIC_DISPATCH(SortI16Desc)(keys, n, Get()); +} + +} // namespace hwy +#endif // HWY_ONCE diff --git a/hwy/contrib/sort/vqsort_i32a.cc b/hwy/contrib/sort/vqsort_i32a.cc new file mode 100644 index 0000000..ae65be9 --- /dev/null +++ b/hwy/contrib/sort/vqsort_i32a.cc @@ -0,0 +1,54 @@ +// Copyright 2021 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "hwy/contrib/sort/vqsort.h" + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_i32a.cc" +#include "hwy/foreach_target.h" // IWYU pragma: keep + +// After foreach_target +#include "hwy/contrib/sort/traits-inl.h" +#include "hwy/contrib/sort/vqsort-inl.h" + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { + +void SortI32Asc(int32_t* HWY_RESTRICT keys, size_t num, + int32_t* HWY_RESTRICT buf) { + SortTag d; + detail::SharedTraits>> st; + Sort(d, st, keys, num, buf); +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace hwy { +namespace { +HWY_EXPORT(SortI32Asc); +} // namespace + +void Sorter::operator()(int32_t* HWY_RESTRICT keys, size_t n, + SortAscending) const { + HWY_DYNAMIC_DISPATCH(SortI32Asc)(keys, n, Get()); +} + +} // namespace hwy +#endif // HWY_ONCE diff --git a/hwy/contrib/sort/vqsort_i32d.cc b/hwy/contrib/sort/vqsort_i32d.cc new file mode 100644 index 0000000..3ce276e --- /dev/null +++ b/hwy/contrib/sort/vqsort_i32d.cc @@ -0,0 +1,54 @@ +// Copyright 2021 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "hwy/contrib/sort/vqsort.h" + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_i32d.cc" +#include "hwy/foreach_target.h" // IWYU pragma: keep + +// After foreach_target +#include "hwy/contrib/sort/traits-inl.h" +#include "hwy/contrib/sort/vqsort-inl.h" + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { + +void SortI32Desc(int32_t* HWY_RESTRICT keys, size_t num, + int32_t* HWY_RESTRICT buf) { + SortTag d; + detail::SharedTraits>> st; + Sort(d, st, keys, num, buf); +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace hwy { +namespace { +HWY_EXPORT(SortI32Desc); +} // namespace + +void Sorter::operator()(int32_t* HWY_RESTRICT keys, size_t n, + SortDescending) const { + HWY_DYNAMIC_DISPATCH(SortI32Desc)(keys, n, Get()); +} + +} // namespace hwy +#endif // HWY_ONCE diff --git a/hwy/contrib/sort/vqsort_i64a.cc b/hwy/contrib/sort/vqsort_i64a.cc new file mode 100644 index 0000000..901b8ea --- /dev/null +++ b/hwy/contrib/sort/vqsort_i64a.cc @@ -0,0 +1,54 @@ +// Copyright 2021 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "hwy/contrib/sort/vqsort.h" + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_i64a.cc" +#include "hwy/foreach_target.h" // IWYU pragma: keep + +// After foreach_target +#include "hwy/contrib/sort/traits-inl.h" +#include "hwy/contrib/sort/vqsort-inl.h" + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { + +void SortI64Asc(int64_t* HWY_RESTRICT keys, size_t num, + int64_t* HWY_RESTRICT buf) { + SortTag d; + detail::SharedTraits>> st; + Sort(d, st, keys, num, buf); +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace hwy { +namespace { +HWY_EXPORT(SortI64Asc); +} // namespace + +void Sorter::operator()(int64_t* HWY_RESTRICT keys, size_t n, + SortAscending) const { + HWY_DYNAMIC_DISPATCH(SortI64Asc)(keys, n, Get()); +} + +} // namespace hwy +#endif // HWY_ONCE diff --git a/hwy/contrib/sort/vqsort_i64d.cc b/hwy/contrib/sort/vqsort_i64d.cc new file mode 100644 index 0000000..7713f2e --- /dev/null +++ b/hwy/contrib/sort/vqsort_i64d.cc @@ -0,0 +1,54 @@ +// Copyright 2021 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "hwy/contrib/sort/vqsort.h" + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_i64d.cc" +#include "hwy/foreach_target.h" // IWYU pragma: keep + +// After foreach_target +#include "hwy/contrib/sort/traits-inl.h" +#include "hwy/contrib/sort/vqsort-inl.h" + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { + +void SortI64Desc(int64_t* HWY_RESTRICT keys, size_t num, + int64_t* HWY_RESTRICT buf) { + SortTag d; + detail::SharedTraits>> st; + Sort(d, st, keys, num, buf); +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace hwy { +namespace { +HWY_EXPORT(SortI64Desc); +} // namespace + +void Sorter::operator()(int64_t* HWY_RESTRICT keys, size_t n, + SortDescending) const { + HWY_DYNAMIC_DISPATCH(SortI64Desc)(keys, n, Get()); +} + +} // namespace hwy +#endif // HWY_ONCE diff --git a/hwy/contrib/sort/vqsort_kv128a.cc b/hwy/contrib/sort/vqsort_kv128a.cc new file mode 100644 index 0000000..1e02742 --- /dev/null +++ b/hwy/contrib/sort/vqsort_kv128a.cc @@ -0,0 +1,65 @@ +// Copyright 2021 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "hwy/contrib/sort/vqsort.h" + +#undef HWY_TARGET_INCLUDE +// clang-format off +// (avoid line break, which would prevent Copybara rules from matching) +#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_kv128a.cc" //NOLINT +// clang-format on +#include "hwy/foreach_target.h" // IWYU pragma: keep + +// After foreach_target +#include "hwy/contrib/sort/traits128-inl.h" +#include "hwy/contrib/sort/vqsort-inl.h" + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { + +void SortKV128Asc(uint64_t* HWY_RESTRICT keys, size_t num, + uint64_t* HWY_RESTRICT buf) { +#if VQSORT_ENABLED + SortTag d; + detail::SharedTraits> st; + Sort(d, st, keys, num, buf); +#else + (void) keys; + (void) num; + (void) buf; + HWY_ASSERT(0); +#endif +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace hwy { +namespace { +HWY_EXPORT(SortKV128Asc); +} // namespace + +void Sorter::operator()(K64V64* HWY_RESTRICT keys, size_t n, + SortAscending) const { + HWY_DYNAMIC_DISPATCH(SortKV128Asc) + (reinterpret_cast(keys), n * 2, Get()); +} + +} // namespace hwy +#endif // HWY_ONCE diff --git a/hwy/contrib/sort/vqsort_kv128d.cc b/hwy/contrib/sort/vqsort_kv128d.cc new file mode 100644 index 0000000..3dd53b5 --- /dev/null +++ b/hwy/contrib/sort/vqsort_kv128d.cc @@ -0,0 +1,65 @@ +// Copyright 2021 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "hwy/contrib/sort/vqsort.h" + +#undef HWY_TARGET_INCLUDE +// clang-format off +// (avoid line break, which would prevent Copybara rules from matching) +#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_kv128d.cc" //NOLINT +// clang-format on +#include "hwy/foreach_target.h" // IWYU pragma: keep + +// After foreach_target +#include "hwy/contrib/sort/traits128-inl.h" +#include "hwy/contrib/sort/vqsort-inl.h" + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { + +void SortKV128Desc(uint64_t* HWY_RESTRICT keys, size_t num, + uint64_t* HWY_RESTRICT buf) { +#if VQSORT_ENABLED + SortTag d; + detail::SharedTraits> st; + Sort(d, st, keys, num, buf); +#else + (void) keys; + (void) num; + (void) buf; + HWY_ASSERT(0); +#endif +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace hwy { +namespace { +HWY_EXPORT(SortKV128Desc); +} // namespace + +void Sorter::operator()(K64V64* HWY_RESTRICT keys, size_t n, + SortDescending) const { + HWY_DYNAMIC_DISPATCH(SortKV128Desc) + (reinterpret_cast(keys), n * 2, Get()); +} + +} // namespace hwy +#endif // HWY_ONCE diff --git a/hwy/contrib/sort/vqsort_kv64a.cc b/hwy/contrib/sort/vqsort_kv64a.cc new file mode 100644 index 0000000..c513e3c --- /dev/null +++ b/hwy/contrib/sort/vqsort_kv64a.cc @@ -0,0 +1,65 @@ +// Copyright 2022 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "hwy/contrib/sort/vqsort.h" + +#undef HWY_TARGET_INCLUDE +// clang-format off +// (avoid line break, which would prevent Copybara rules from matching) +#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_kv64a.cc" //NOLINT +// clang-format on +#include "hwy/foreach_target.h" // IWYU pragma: keep + +// After foreach_target +#include "hwy/contrib/sort/traits-inl.h" +#include "hwy/contrib/sort/vqsort-inl.h" + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { + +void SortKV64Asc(uint64_t* HWY_RESTRICT keys, size_t num, + uint64_t* HWY_RESTRICT buf) { +#if VQSORT_ENABLED + SortTag d; + detail::SharedTraits> st; + Sort(d, st, keys, num, buf); +#else + (void) keys; + (void) num; + (void) buf; + HWY_ASSERT(0); +#endif +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace hwy { +namespace { +HWY_EXPORT(SortKV64Asc); +} // namespace + +void Sorter::operator()(K32V32* HWY_RESTRICT keys, size_t n, + SortAscending) const { + HWY_DYNAMIC_DISPATCH(SortKV64Asc) + (reinterpret_cast(keys), n, Get()); +} + +} // namespace hwy +#endif // HWY_ONCE diff --git a/hwy/contrib/sort/vqsort_kv64d.cc b/hwy/contrib/sort/vqsort_kv64d.cc new file mode 100644 index 0000000..c6c5fdc --- /dev/null +++ b/hwy/contrib/sort/vqsort_kv64d.cc @@ -0,0 +1,65 @@ +// Copyright 2022 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "hwy/contrib/sort/vqsort.h" + +#undef HWY_TARGET_INCLUDE +// clang-format off +// (avoid line break, which would prevent Copybara rules from matching) +#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_kv64d.cc" //NOLINT +// clang-format on +#include "hwy/foreach_target.h" // IWYU pragma: keep + +// After foreach_target +#include "hwy/contrib/sort/traits-inl.h" +#include "hwy/contrib/sort/vqsort-inl.h" + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { + +void SortKV64Desc(uint64_t* HWY_RESTRICT keys, size_t num, + uint64_t* HWY_RESTRICT buf) { +#if VQSORT_ENABLED + SortTag d; + detail::SharedTraits> st; + Sort(d, st, keys, num, buf); +#else + (void) keys; + (void) num; + (void) buf; + HWY_ASSERT(0); +#endif +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace hwy { +namespace { +HWY_EXPORT(SortKV64Desc); +} // namespace + +void Sorter::operator()(K32V32* HWY_RESTRICT keys, size_t n, + SortDescending) const { + HWY_DYNAMIC_DISPATCH(SortKV64Desc) + (reinterpret_cast(keys), n, Get()); +} + +} // namespace hwy +#endif // HWY_ONCE diff --git a/hwy/contrib/sort/vqsort_u16a.cc b/hwy/contrib/sort/vqsort_u16a.cc new file mode 100644 index 0000000..0a97ffa --- /dev/null +++ b/hwy/contrib/sort/vqsort_u16a.cc @@ -0,0 +1,54 @@ +// Copyright 2021 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "hwy/contrib/sort/vqsort.h" + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_u16a.cc" +#include "hwy/foreach_target.h" // IWYU pragma: keep + +// After foreach_target +#include "hwy/contrib/sort/traits-inl.h" +#include "hwy/contrib/sort/vqsort-inl.h" + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { + +void SortU16Asc(uint16_t* HWY_RESTRICT keys, size_t num, + uint16_t* HWY_RESTRICT buf) { + SortTag d; + detail::SharedTraits>> st; + Sort(d, st, keys, num, buf); +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace hwy { +namespace { +HWY_EXPORT(SortU16Asc); +} // namespace + +void Sorter::operator()(uint16_t* HWY_RESTRICT keys, size_t n, + SortAscending) const { + HWY_DYNAMIC_DISPATCH(SortU16Asc)(keys, n, Get()); +} + +} // namespace hwy +#endif // HWY_ONCE diff --git a/hwy/contrib/sort/vqsort_u16d.cc b/hwy/contrib/sort/vqsort_u16d.cc new file mode 100644 index 0000000..286ebbb --- /dev/null +++ b/hwy/contrib/sort/vqsort_u16d.cc @@ -0,0 +1,55 @@ +// Copyright 2021 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "hwy/contrib/sort/vqsort.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_u16d.cc"
+#include "hwy/foreach_target.h"  // IWYU pragma: keep
+
+// After foreach_target
+#include "hwy/contrib/sort/traits-inl.h"
+#include "hwy/contrib/sort/vqsort-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+void SortU16Desc(uint16_t* HWY_RESTRICT keys, size_t num,
+                 uint16_t* HWY_RESTRICT buf) {
+  SortTag<uint16_t> d;
+  detail::SharedTraits<detail::TraitsLane<detail::OrderDescending<uint16_t>>>
+      st;
+  Sort(d, st, keys, num, buf);
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace hwy {
+namespace {
+HWY_EXPORT(SortU16Desc);
+}  // namespace
+
+void Sorter::operator()(uint16_t* HWY_RESTRICT keys, size_t n,
+                        SortDescending) const {
+  HWY_DYNAMIC_DISPATCH(SortU16Desc)(keys, n, Get());
+}
+
+}  // namespace hwy
+#endif  // HWY_ONCE
diff --git a/hwy/contrib/sort/vqsort_u32a.cc b/hwy/contrib/sort/vqsort_u32a.cc
new file mode 100644
index 0000000..b6a69e6
--- /dev/null
+++ b/hwy/contrib/sort/vqsort_u32a.cc
@@ -0,0 +1,54 @@
+// Copyright 2021 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "hwy/contrib/sort/vqsort.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_u32a.cc"
+#include "hwy/foreach_target.h"  // IWYU pragma: keep
+
+// After foreach_target
+#include "hwy/contrib/sort/traits-inl.h"
+#include "hwy/contrib/sort/vqsort-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+void SortU32Asc(uint32_t* HWY_RESTRICT keys, size_t num,
+                uint32_t* HWY_RESTRICT buf) {
+  SortTag<uint32_t> d;
+  detail::SharedTraits<detail::TraitsLane<detail::OrderAscending<uint32_t>>> st;
+  Sort(d, st, keys, num, buf);
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace hwy {
+namespace {
+HWY_EXPORT(SortU32Asc);
+}  // namespace
+
+void Sorter::operator()(uint32_t* HWY_RESTRICT keys, size_t n,
+                        SortAscending) const {
+  HWY_DYNAMIC_DISPATCH(SortU32Asc)(keys, n, Get());
+}
+
+}  // namespace hwy
+#endif  // HWY_ONCE
diff --git a/hwy/contrib/sort/vqsort_u32d.cc b/hwy/contrib/sort/vqsort_u32d.cc
new file mode 100644
index 0000000..38fc1e1
--- /dev/null
+++ b/hwy/contrib/sort/vqsort_u32d.cc
@@ -0,0 +1,55 @@
+// Copyright 2021 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "hwy/contrib/sort/vqsort.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_u32d.cc"
+#include "hwy/foreach_target.h"  // IWYU pragma: keep
+
+// After foreach_target
+#include "hwy/contrib/sort/traits-inl.h"
+#include "hwy/contrib/sort/vqsort-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+void SortU32Desc(uint32_t* HWY_RESTRICT keys, size_t num,
+                 uint32_t* HWY_RESTRICT buf) {
+  SortTag<uint32_t> d;
+  detail::SharedTraits<detail::TraitsLane<detail::OrderDescending<uint32_t>>>
+      st;
+  Sort(d, st, keys, num, buf);
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace hwy {
+namespace {
+HWY_EXPORT(SortU32Desc);
+}  // namespace
+
+void Sorter::operator()(uint32_t* HWY_RESTRICT keys, size_t n,
+                        SortDescending) const {
+  HWY_DYNAMIC_DISPATCH(SortU32Desc)(keys, n, Get());
+}
+
+}  // namespace hwy
+#endif  // HWY_ONCE
diff --git a/hwy/contrib/sort/vqsort_u64a.cc b/hwy/contrib/sort/vqsort_u64a.cc
new file mode 100644
index 0000000..a29824a
--- /dev/null
+++ b/hwy/contrib/sort/vqsort_u64a.cc
@@ -0,0 +1,54 @@
+// Copyright 2021 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "hwy/contrib/sort/vqsort.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_u64a.cc"
+#include "hwy/foreach_target.h"  // IWYU pragma: keep
+
+// After foreach_target
+#include "hwy/contrib/sort/traits-inl.h"
+#include "hwy/contrib/sort/vqsort-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+void SortU64Asc(uint64_t* HWY_RESTRICT keys, size_t num,
+                uint64_t* HWY_RESTRICT buf) {
+  SortTag<uint64_t> d;
+  detail::SharedTraits<detail::TraitsLane<detail::OrderAscending<uint64_t>>> st;
+  Sort(d, st, keys, num, buf);
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace hwy {
+namespace {
+HWY_EXPORT(SortU64Asc);
+}  // namespace
+
+void Sorter::operator()(uint64_t* HWY_RESTRICT keys, size_t n,
+                        SortAscending) const {
+  HWY_DYNAMIC_DISPATCH(SortU64Asc)(keys, n, Get());
+}
+
+}  // namespace hwy
+#endif  // HWY_ONCE
diff --git a/hwy/contrib/sort/vqsort_u64d.cc b/hwy/contrib/sort/vqsort_u64d.cc
new file mode 100644
index 0000000..d692458
--- /dev/null
+++ b/hwy/contrib/sort/vqsort_u64d.cc
@@ -0,0 +1,55 @@
+// Copyright 2021 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "hwy/contrib/sort/vqsort.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_u64d.cc"
+#include "hwy/foreach_target.h"  // IWYU pragma: keep
+
+// After foreach_target
+#include "hwy/contrib/sort/traits-inl.h"
+#include "hwy/contrib/sort/vqsort-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+void SortU64Desc(uint64_t* HWY_RESTRICT keys, size_t num,
+                 uint64_t* HWY_RESTRICT buf) {
+  SortTag<uint64_t> d;
+  detail::SharedTraits<detail::TraitsLane<detail::OrderDescending<uint64_t>>>
+      st;
+  Sort(d, st, keys, num, buf);
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace hwy {
+namespace {
+HWY_EXPORT(SortU64Desc);
+}  // namespace
+
+void Sorter::operator()(uint64_t* HWY_RESTRICT keys, size_t n,
+                        SortDescending) const {
+  HWY_DYNAMIC_DISPATCH(SortU64Desc)(keys, n, Get());
+}
+
+}  // namespace hwy
+#endif  // HWY_ONCE
diff --git a/hwy/detect_compiler_arch.h b/hwy/detect_compiler_arch.h
new file mode 100644
index 0000000..98c6a55
--- /dev/null
+++ b/hwy/detect_compiler_arch.h
@@ -0,0 +1,234 @@
+// Copyright 2020 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef HIGHWAY_HWY_DETECT_COMPILER_ARCH_H_
+#define HIGHWAY_HWY_DETECT_COMPILER_ARCH_H_
+
+// Detects compiler and arch from predefined macros. Zero dependencies for
+// inclusion by foreach_target.h.
+
+// Add to #if conditions to prevent IDE from graying out code.
+#if (defined __CDT_PARSER__) || (defined __INTELLISENSE__) || \
+    (defined Q_CREATOR_RUN) || (defined(__CLANGD__))
+#define HWY_IDE 1
+#else
+#define HWY_IDE 0
+#endif
+
+//------------------------------------------------------------------------------
+// Compiler
+
+// Actual MSVC, not clang-cl, which defines _MSC_VER but doesn't behave like
+// MSVC in other aspects (e.g. HWY_DIAGNOSTICS).
+#if defined(_MSC_VER) && !defined(__clang__)
+#define HWY_COMPILER_MSVC _MSC_VER
+#else
+#define HWY_COMPILER_MSVC 0
+#endif
+
+#if defined(_MSC_VER) && defined(__clang__)
+#define HWY_COMPILER_CLANGCL _MSC_VER
+#else
+#define HWY_COMPILER_CLANGCL 0
+#endif
+
+#ifdef __INTEL_COMPILER
+#define HWY_COMPILER_ICC __INTEL_COMPILER
+#else
+#define HWY_COMPILER_ICC 0
+#endif
+
+#ifdef __INTEL_LLVM_COMPILER
+#define HWY_COMPILER_ICX __INTEL_LLVM_COMPILER
+#else
+#define HWY_COMPILER_ICX 0
+#endif
+
+// HWY_COMPILER_GCC is a generic macro for all compilers implementing the GNU
+// compiler extensions (e.g. Clang, Intel...)
+#ifdef __GNUC__
+#define HWY_COMPILER_GCC (__GNUC__ * 100 + __GNUC_MINOR__)
+#else
+#define HWY_COMPILER_GCC 0
+#endif
+
+// Clang or clang-cl, not GCC.
+#ifdef __clang__
+// In case of Apple LLVM (whose version number is unrelated to that of LLVM) or
+// an invalid version number, deduce it from the presence of warnings.
+// Adapted from https://github.com/simd-everywhere/simde/ simde-detect-clang.h.
+#if defined(__apple_build_version__) || __clang_major__ >= 999
+#if __has_warning("-Wbitwise-instead-of-logical")
+#define HWY_COMPILER_CLANG 1400
+#elif __has_warning("-Wreserved-identifier")
+#define HWY_COMPILER_CLANG 1300
+#elif __has_warning("-Wformat-insufficient-args")
+#define HWY_COMPILER_CLANG 1200
+#elif __has_warning("-Wimplicit-const-int-float-conversion")
+#define HWY_COMPILER_CLANG 1100
+#elif __has_warning("-Wmisleading-indentation")
+#define HWY_COMPILER_CLANG 1000
+#elif defined(__FILE_NAME__)
+#define HWY_COMPILER_CLANG 900
+#elif __has_warning("-Wextra-semi-stmt") || \
+    __has_builtin(__builtin_rotateleft32)
+#define HWY_COMPILER_CLANG 800
+// For reasons unknown, XCode 10.3 (Apple LLVM version 10.0.1) is apparently
+// based on Clang 7, but does not support the warning we test.
+// See https://en.wikipedia.org/wiki/Xcode#Toolchain_versions and
+// https://trac.macports.org/wiki/XcodeVersionInfo.
+#elif __has_warning("-Wc++98-compat-extra-semi") || \
+    (defined(__apple_build_version__) && __apple_build_version__ >= 10010000)
+#define HWY_COMPILER_CLANG 700
+#else  // Anything older than 7.0 is not recommended for Highway.
+#define HWY_COMPILER_CLANG 600
+#endif  // __has_warning chain
+#else  // use normal version
+#define HWY_COMPILER_CLANG (__clang_major__ * 100 + __clang_minor__)
+#endif
+#else  // Not clang
+#define HWY_COMPILER_CLANG 0
+#endif
+
+#if HWY_COMPILER_GCC && !HWY_COMPILER_CLANG
+#define HWY_COMPILER_GCC_ACTUAL HWY_COMPILER_GCC
+#else
+#define HWY_COMPILER_GCC_ACTUAL 0
+#endif
+
+// More than one may be nonzero, but we want at least one.
+#if 0 == (HWY_COMPILER_MSVC + HWY_COMPILER_CLANGCL + HWY_COMPILER_ICC + \
+          HWY_COMPILER_GCC + HWY_COMPILER_CLANG)
+#error "Unsupported compiler"
+#endif
+
+// We should only detect one of these (only clang/clangcl overlap)
+#if 1 < \
+    (!!HWY_COMPILER_MSVC + !!HWY_COMPILER_ICC + !!HWY_COMPILER_GCC_ACTUAL + \
+     !!(HWY_COMPILER_CLANGCL | HWY_COMPILER_CLANG))
+#error "Detected multiple compilers"
+#endif
+
+#ifdef __has_builtin
+#define HWY_HAS_BUILTIN(name) __has_builtin(name)
+#else
+#define HWY_HAS_BUILTIN(name) 0
+#endif
+
+#ifdef __has_attribute
+#define HWY_HAS_ATTRIBUTE(name) __has_attribute(name)
+#else
+#define HWY_HAS_ATTRIBUTE(name) 0
+#endif
+
+#ifdef __has_feature
+#define HWY_HAS_FEATURE(name) __has_feature(name)
+#else
+#define HWY_HAS_FEATURE(name) 0
+#endif
+
+//------------------------------------------------------------------------------
+// Architecture
+
+#if defined(__i386__) || defined(_M_IX86)
+#define HWY_ARCH_X86_32 1
+#else
+#define HWY_ARCH_X86_32 0
+#endif
+
+#if defined(__x86_64__) || defined(_M_X64)
+#define HWY_ARCH_X86_64 1
+#else
+#define HWY_ARCH_X86_64 0
+#endif
+
+#if HWY_ARCH_X86_32 && HWY_ARCH_X86_64
+#error "Cannot have both x86-32 and x86-64"
+#endif
+
+#if HWY_ARCH_X86_32 || HWY_ARCH_X86_64
+#define HWY_ARCH_X86 1
+#else
+#define HWY_ARCH_X86 0
+#endif
+
+#if defined(__powerpc64__) || defined(_M_PPC)
+#define HWY_ARCH_PPC 1
+#else
+#define HWY_ARCH_PPC 0
+#endif
+
+#if defined(__ARM_ARCH_ISA_A64) || defined(__aarch64__) || defined(_M_ARM64)
+#define HWY_ARCH_ARM_A64 1
+#else
+#define HWY_ARCH_ARM_A64 0
+#endif
+
+#if (defined(__ARM_ARCH) && __ARM_ARCH == 7) || (defined(_M_ARM) && _M_ARM == 7)
+#define HWY_ARCH_ARM_V7 1
+#else
+#define HWY_ARCH_ARM_V7 0
+#endif
+
+#if HWY_ARCH_ARM_A64 && HWY_ARCH_ARM_V7
+#error "Cannot have both A64 and V7"
+#endif
+
+// Any *supported* version of Arm, i.e. 7 or later
+#if HWY_ARCH_ARM_A64 || HWY_ARCH_ARM_V7
+#define HWY_ARCH_ARM 1
+#else
+#define HWY_ARCH_ARM 0
+#endif
+
+// Older than v7 (e.g. armel aka Arm v5), in which case we do not support SIMD.
+#if (defined(__arm__) || defined(_M_ARM)) && !HWY_ARCH_ARM
+#define HWY_ARCH_ARM_OLD 1
+#else
+#define HWY_ARCH_ARM_OLD 0
+#endif
+
+#if defined(__EMSCRIPTEN__) || defined(__wasm__) || defined(__WASM__)
+#define HWY_ARCH_WASM 1
+#else
+#define HWY_ARCH_WASM 0
+#endif
+
+#ifdef __riscv
+#define HWY_ARCH_RVV 1
+#else
+#define HWY_ARCH_RVV 0
+#endif
+
+// It is an error to detect multiple architectures at the same time, but OK to
+// detect none of the above.
+#if (HWY_ARCH_X86 + HWY_ARCH_PPC + HWY_ARCH_ARM + HWY_ARCH_ARM_OLD + \
+     HWY_ARCH_WASM + HWY_ARCH_RVV) > 1
+#error "Must not detect more than one architecture"
+#endif
+
+#if defined(_WIN32) || defined(_WIN64)
+#define HWY_OS_WIN 1
+#else
+#define HWY_OS_WIN 0
+#endif
+
+#if defined(linux) || defined(__linux__)
+#define HWY_OS_LINUX 1
+#else
+#define HWY_OS_LINUX 0
+#endif
+
+#endif  // HIGHWAY_HWY_DETECT_COMPILER_ARCH_H_
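Because every HWY_COMPILER_*, HWY_ARCH_* and HWY_OS_* macro above is always defined (as a nonzero version/flag or 0), user code can branch on them with plain integer comparisons instead of #ifdef chains. A minimal, purely illustrative sketch; the version threshold and workaround are hypothetical:

    #include "hwy/detect_compiler_arch.h"

    // Comparisons are safe even when the compiler is absent (value 0).
    #if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 900
    // Hypothetical workaround for GCC older than 9 would go here.
    #endif

    #if HWY_ARCH_X86 && !HWY_COMPILER_MSVC
    // x86-specific path for GCC/Clang-compatible compilers.
    #endif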
diff --git a/hwy/detect_targets.h b/hwy/detect_targets.h
new file mode 100644
index 0000000..7f7e179
--- /dev/null
+++ b/hwy/detect_targets.h
@@ -0,0 +1,478 @@
+// Copyright 2021 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef HIGHWAY_HWY_DETECT_TARGETS_H_
+#define HIGHWAY_HWY_DETECT_TARGETS_H_
+
+// Defines targets and chooses which to enable.
+
+#include "hwy/detect_compiler_arch.h"
+
+//------------------------------------------------------------------------------
+// Optional configuration
+
+// See g3doc/quick_reference.md for documentation of these macros.
+
+// Uncomment to override the default baseline determined from predefined
+// macros:
+// #define HWY_BASELINE_TARGETS (HWY_SSE4 | HWY_SCALAR)
+
+// Uncomment to override the default blocklist:
+// #define HWY_BROKEN_TARGETS HWY_AVX3
+
+// Uncomment to definitely avoid generating those target(s):
+// #define HWY_DISABLED_TARGETS HWY_SSE4
+
+// Uncomment to avoid emitting BMI/BMI2/FMA instructions (allows generating
+// AVX2 target for VMs which support AVX2 but not the other instruction sets)
+// #define HWY_DISABLE_BMI2_FMA
+
+// Uncomment to enable SSSE3/SSE4 on MSVC even if AVX is not enabled
+// #define HWY_WANT_SSSE3
+// #define HWY_WANT_SSE4
+
+//------------------------------------------------------------------------------
+// Targets
+
+// Unique bit value for each target. A lower value is "better" (e.g. more
+// lanes) than a higher value within the same group/platform - see
+// HWY_STATIC_TARGET.
+//
+// All values are unconditionally defined so we can test HWY_TARGETS without
+// first checking the HWY_ARCH_*.
+//
+// The C99 preprocessor evaluates #if expressions using intmax_t types. This
+// holds at least 64 bits in practice (verified 2022-07-18 via Godbolt on
+// 32-bit clang/GCC/MSVC compilers for x86/Arm7/AArch32/RISC-V/WASM). We now
+// avoid overflow when computing HWY_TARGETS (subtracting one instead of
+// left-shifting 2^62), but still do not use bit 63 because it is the sign bit.
+
+// --------------------------- x86: 15 targets (+ one fallback)
+// Bits 0..6 reserved (7 targets)
+// Currently satisfiable by Ice Lake (VNNI, VPCLMULQDQ, VPOPCNTDQ, VBMI, VBMI2,
+// VAES, BITALG). Later to be added: BF16 (Cooper Lake). VP2INTERSECT is only in
+// Tiger Lake? We do not yet have uses for GFNI.
+#define HWY_AVX3_DL (1LL << 7)  // see HWY_WANT_AVX3_DL below
+#define HWY_AVX3 (1LL << 8)
+#define HWY_AVX2 (1LL << 9)
+// Bit 10: reserved for AVX
+#define HWY_SSE4 (1LL << 11)
+#define HWY_SSSE3 (1LL << 12)
+// Bits 13..14 reserved for SSE3 or SSE2 (2 targets)
+// The highest bit in the HWY_TARGETS mask that an x86 target can have. Used
+// for dynamic dispatch. All x86 target bits must be lower or equal to
+// (1 << HWY_HIGHEST_TARGET_BIT_X86) and they can only use
+// HWY_MAX_DYNAMIC_TARGETS in total.
+#define HWY_HIGHEST_TARGET_BIT_X86 14
+
+// --------------------------- Arm: 15 targets (+ one fallback)
+// Bits 15..23 reserved (9 targets)
+#define HWY_SVE2_128 (1LL << 24)  // specialized target (e.g. Arm N2)
+#define HWY_SVE_256 (1LL << 25)  // specialized target (e.g. Arm V1)
+#define HWY_SVE2 (1LL << 26)
+#define HWY_SVE (1LL << 27)
+#define HWY_NEON (1LL << 28)  // On A64, includes/requires AES
+// Bit 29 reserved (Helium?)
+#define HWY_HIGHEST_TARGET_BIT_ARM 29
+
+// --------------------------- RISC-V: 9 targets (+ one fallback)
+// Bits 30..36 reserved (7 targets)
+#define HWY_RVV (1LL << 37)
+// Bit 38 reserved
+#define HWY_HIGHEST_TARGET_BIT_RVV 38
+
+// --------------------------- Future expansion: 4 targets
+// Bits 39..42 reserved
+
+
+// --------------------------- IBM Power: 9 targets (+ one fallback)
+// Bits 43..48 reserved (6 targets)
+#define HWY_PPC8 (1LL << 49)  // v2.07 or 3
+// Bits 50..51 reserved for prior VSX/AltiVec (2 targets)
+#define HWY_HIGHEST_TARGET_BIT_PPC 51
+
+// --------------------------- WebAssembly: 9 targets (+ one fallback)
+// Bits 52..57 reserved (6 targets)
+#define HWY_WASM_EMU256 (1LL << 58)  // Experimental
+#define HWY_WASM (1LL << 59)
+// Bits 60 reserved
+#define HWY_HIGHEST_TARGET_BIT_WASM 60
+
+// --------------------------- Emulation: 2 targets
+
+#define HWY_EMU128 (1LL << 61)
+// We do not add/left-shift, so this will not overflow to a negative number.
+#define HWY_SCALAR (1LL << 62)
+#define HWY_HIGHEST_TARGET_BIT_SCALAR 62
+
+// Do not use bit 63 - would be confusing to have negative numbers.
+
+//------------------------------------------------------------------------------
+// Set default blocklists
+
+// Disabled means excluded from enabled at user's request. A separate config
+// macro allows disabling without deactivating the blocklist below.
+#ifndef HWY_DISABLED_TARGETS
+#define HWY_DISABLED_TARGETS 0
+#endif
+
+// Broken means excluded from enabled due to known compiler issues. Allow the
+// user to override this blocklist without any guarantee of success.
+#ifndef HWY_BROKEN_TARGETS
+
+// x86 clang-6: we saw multiple AVX2/3 compile errors and in one case invalid
+// SSE4 codegen (possibly only for msan), so disable all those targets.
+#if HWY_ARCH_X86 && (HWY_COMPILER_CLANG != 0 && HWY_COMPILER_CLANG < 700)
+#define HWY_BROKEN_TARGETS (HWY_SSE4 | HWY_AVX2 | HWY_AVX3 | HWY_AVX3_DL)
+// This entails a major speed reduction, so warn unless the user explicitly
+// opts in to scalar-only.
+#if !defined(HWY_COMPILE_ONLY_SCALAR)
+#pragma message("x86 Clang <= 6: define HWY_COMPILE_ONLY_SCALAR or upgrade.")
+#endif
+
+// 32-bit may fail to compile AVX2/3.
+#elif HWY_ARCH_X86_32
+#define HWY_BROKEN_TARGETS (HWY_AVX2 | HWY_AVX3 | HWY_AVX3_DL)
+
+// MSVC AVX3 support is buggy: https://github.com/Mysticial/Flops/issues/16
+#elif HWY_COMPILER_MSVC != 0
+#define HWY_BROKEN_TARGETS (HWY_AVX3 | HWY_AVX3_DL)
+
+// armv7be has not been tested and is not yet supported.
+#elif HWY_ARCH_ARM_V7 && \
+    (defined(__ARM_BIG_ENDIAN) || \
+     (defined(__BYTE_ORDER) && __BYTE_ORDER == __BIG_ENDIAN))
+#define HWY_BROKEN_TARGETS (HWY_NEON)
+
+// SVE[2] require recent clang or gcc versions.
+#elif (HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 1100) || \
+    (HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1000)
+#define HWY_BROKEN_TARGETS (HWY_SVE | HWY_SVE2 | HWY_SVE_256 | HWY_SVE2_128)
+
+#else
+#define HWY_BROKEN_TARGETS 0
+#endif
+
+#endif  // HWY_BROKEN_TARGETS
+
+// Enabled means not disabled nor blocklisted.
+#define HWY_ENABLED(targets) \
+  ((targets) & ~((HWY_DISABLED_TARGETS) | (HWY_BROKEN_TARGETS)))
+
+// Opt-out for EMU128 (affected by a GCC bug on multiple arches, fixed in 12.3:
+// see https://gcc.gnu.org/bugzilla/show_bug.cgi?id=106322). This is separate
+// from HWY_BROKEN_TARGETS because it affects the fallback target, which must
+// always be enabled. If 1, we instead choose HWY_SCALAR even without
+// HWY_COMPILE_ONLY_SCALAR being set.
+#if !defined(HWY_BROKEN_EMU128)  // allow overriding
+#if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1203
+#define HWY_BROKEN_EMU128 1
+#else
+#define HWY_BROKEN_EMU128 0
+#endif
+#endif  // HWY_BROKEN_EMU128
+
+//------------------------------------------------------------------------------
+// Detect baseline targets using predefined macros
+
+// Baseline means the targets for which the compiler is allowed to generate
+// instructions, implying the target CPU would have to support them. This does
+// not take the blocklist into account.
+
+#if defined(HWY_COMPILE_ONLY_SCALAR) || HWY_BROKEN_EMU128
+#define HWY_BASELINE_SCALAR HWY_SCALAR
+#else
+#define HWY_BASELINE_SCALAR HWY_EMU128
+#endif
+
+// Also check HWY_ARCH to ensure that simulating unknown platforms ends up with
+// HWY_TARGET == HWY_BASELINE_SCALAR.
+
+#if HWY_ARCH_WASM && defined(__wasm_simd128__)
+#if defined(HWY_WANT_WASM2)
+#define HWY_BASELINE_WASM HWY_WASM_EMU256
+#else
+#define HWY_BASELINE_WASM HWY_WASM
+#endif  // HWY_WANT_WASM2
+#else
+#define HWY_BASELINE_WASM 0
+#endif
+
+// Avoid choosing the PPC target until we have an implementation.
+#if HWY_ARCH_PPC && defined(__VSX__) && 0
+#define HWY_BASELINE_PPC8 HWY_PPC8
+#else
+#define HWY_BASELINE_PPC8 0
+#endif
+
+#define HWY_BASELINE_SVE2 0
+#define HWY_BASELINE_SVE 0
+#define HWY_BASELINE_NEON 0
+
+#if HWY_ARCH_ARM
+
+#if defined(__ARM_FEATURE_SVE2)
+#undef HWY_BASELINE_SVE2  // was 0, will be re-defined
+// If user specified -msve-vector-bits=128, they assert the vector length is
+// 128 bits and we should use the HWY_SVE2_128 (more efficient for some ops).
+#if defined(__ARM_FEATURE_SVE_BITS) && __ARM_FEATURE_SVE_BITS == 128
+#define HWY_BASELINE_SVE2 HWY_SVE2_128
+// Otherwise we're not sure what the vector length will be. The baseline must
+// be unconditionally valid, so we can only assume HWY_SVE2. However, when
+// running on a CPU with 128-bit vectors, user code that supports dynamic
+// dispatch will still benefit from HWY_SVE2_128 because we add it to
+// HWY_ATTAINABLE_TARGETS.
+#else
+#define HWY_BASELINE_SVE2 HWY_SVE2
+#endif  // __ARM_FEATURE_SVE_BITS
+#endif  // __ARM_FEATURE_SVE2
+
+#if defined(__ARM_FEATURE_SVE)
+#undef HWY_BASELINE_SVE  // was 0, will be re-defined
+// See above. If user-specified vector length matches our optimization, use it.
+#if defined(__ARM_FEATURE_SVE_BITS) && __ARM_FEATURE_SVE_BITS == 256
+#define HWY_BASELINE_SVE HWY_SVE_256
+#else
+#define HWY_BASELINE_SVE HWY_SVE
+#endif  // __ARM_FEATURE_SVE_BITS
+#endif  // __ARM_FEATURE_SVE
+
+// GCC 4.5.4 only defines __ARM_NEON__; 5.4 defines both.
+#if defined(__ARM_NEON__) || defined(__ARM_NEON)
+#undef HWY_BASELINE_NEON
+#define HWY_BASELINE_NEON HWY_NEON
+#endif
+
+#endif  // HWY_ARCH_ARM
+
+// Special handling for MSVC because it has fewer predefined macros:
+#if HWY_COMPILER_MSVC
+
+// 1) We can only be sure SSSE3/SSE4 are enabled if AVX is:
+//    https://stackoverflow.com/questions/18563978/.
+#if defined(__AVX__)
+#define HWY_CHECK_SSSE3 1
+#define HWY_CHECK_SSE4 1
+#else
+#define HWY_CHECK_SSSE3 0
+#define HWY_CHECK_SSE4 0
+#endif
+
+// 2) Cannot check for PCLMUL/AES and BMI2/FMA/F16C individually; we assume
+//    PCLMUL/AES are available if SSE4 is, and BMI2/FMA/F16C if AVX2 is.
+#define HWY_CHECK_PCLMUL_AES 1
+#define HWY_CHECK_BMI2_FMA 1
+#define HWY_CHECK_F16C 1
+
+#else  // non-MSVC
+
+#if defined(__SSSE3__)
+#define HWY_CHECK_SSSE3 1
+#else
+#define HWY_CHECK_SSSE3 0
+#endif
+
+#if defined(__SSE4_1__) && defined(__SSE4_2__)
+#define HWY_CHECK_SSE4 1
+#else
+#define HWY_CHECK_SSE4 0
+#endif
+
+// If these are disabled, they should not gate the availability of SSE4/AVX2.
+#if defined(HWY_DISABLE_PCLMUL_AES) || (defined(__PCLMUL__) && defined(__AES__))
+#define HWY_CHECK_PCLMUL_AES 1
+#else
+#define HWY_CHECK_PCLMUL_AES 0
+#endif
+
+#if defined(HWY_DISABLE_BMI2_FMA) || (defined(__BMI2__) && defined(__FMA__))
+#define HWY_CHECK_BMI2_FMA 1
+#else
+#define HWY_CHECK_BMI2_FMA 0
+#endif
+
+#if defined(HWY_DISABLE_F16C) || defined(__F16C__)
+#define HWY_CHECK_F16C 1
+#else
+#define HWY_CHECK_F16C 0
+#endif
+
+#endif  // non-MSVC
+
+#if HWY_ARCH_X86 && (HWY_WANT_SSSE3 || HWY_CHECK_SSSE3)
+#define HWY_BASELINE_SSSE3 HWY_SSSE3
+#else
+#define HWY_BASELINE_SSSE3 0
+#endif
+
+#if HWY_ARCH_X86 && (HWY_WANT_SSE4 || (HWY_CHECK_SSE4 && HWY_CHECK_PCLMUL_AES))
+#define HWY_BASELINE_SSE4 HWY_SSE4
+#else
+#define HWY_BASELINE_SSE4 0
+#endif
+
+#if HWY_BASELINE_SSE4 != 0 && HWY_CHECK_BMI2_FMA && HWY_CHECK_F16C && \
+    defined(__AVX2__)
+#define HWY_BASELINE_AVX2 HWY_AVX2
+#else
+#define HWY_BASELINE_AVX2 0
+#endif
+
+// Require everything in AVX2 plus AVX-512 flags (also set by MSVC)
+#if HWY_BASELINE_AVX2 != 0 && defined(__AVX512F__) && defined(__AVX512BW__) && \
+    defined(__AVX512DQ__) && defined(__AVX512VL__)
+#define HWY_BASELINE_AVX3 HWY_AVX3
+#else
+#define HWY_BASELINE_AVX3 0
+#endif
+
+// TODO(janwas): not yet known whether these will be set by MSVC
+#if HWY_BASELINE_AVX3 != 0 && defined(__AVXVNNI__) && defined(__VAES__) && \
+    defined(__VPCLMULQDQ__) && defined(__AVX512VBMI__) && \
+    defined(__AVX512VBMI2__) && defined(__AVX512VPOPCNTDQ__) && \
+    defined(__AVX512BITALG__)
+#define HWY_BASELINE_AVX3_DL HWY_AVX3_DL
+#else
+#define HWY_BASELINE_AVX3_DL 0
+#endif
+
+#if HWY_ARCH_RVV && defined(__riscv_vector)
+#define HWY_BASELINE_RVV HWY_RVV
+#else
+#define HWY_BASELINE_RVV 0
+#endif
+
+// Allow the user to override this without any guarantee of success.
+#ifndef HWY_BASELINE_TARGETS
+#define HWY_BASELINE_TARGETS \
+  (HWY_BASELINE_SCALAR | HWY_BASELINE_WASM | HWY_BASELINE_PPC8 | \
+   HWY_BASELINE_SVE2 | HWY_BASELINE_SVE | HWY_BASELINE_NEON | \
+   HWY_BASELINE_SSSE3 | HWY_BASELINE_SSE4 | HWY_BASELINE_AVX2 | \
+   HWY_BASELINE_AVX3 | HWY_BASELINE_AVX3_DL | HWY_BASELINE_RVV)
+#endif  // HWY_BASELINE_TARGETS
+
+//------------------------------------------------------------------------------
+// Choose target for static dispatch
+
+#define HWY_ENABLED_BASELINE HWY_ENABLED(HWY_BASELINE_TARGETS)
+#if HWY_ENABLED_BASELINE == 0
+#error "At least one baseline target must be defined and enabled"
+#endif
+
+// Best baseline, used for static dispatch. This is the least-significant 1-bit
+// within HWY_ENABLED_BASELINE and lower bit values imply "better".
+#define HWY_STATIC_TARGET (HWY_ENABLED_BASELINE & -HWY_ENABLED_BASELINE)
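Worked example of the least-significant-bit trick, with illustrative values: in two's complement, x & -x isolates the lowest set bit, and the target bits above are defined so that lower bits are better.

    // Suppose the enabled baseline is HWY_AVX2 | HWY_SSSE3 | HWY_EMU128,
    // i.e. bits 9, 12 and 61. The best (lowest) bit is HWY_AVX2.
    constexpr long long kEnabledBaseline =
        (1LL << 9) | (1LL << 12) | (1LL << 61);
    static_assert((kEnabledBaseline & -kEnabledBaseline) == (1LL << 9),
                  "static target is the lowest, i.e. best, enabled bit");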
+
+// Start by assuming static dispatch. If we later use dynamic dispatch, this
+// will be defined to other targets during the multiple-inclusion, and finally
+// return to the initial value. Defining this outside begin/end_target ensures
+// inl headers successfully compile by themselves (required by Bazel).
+#define HWY_TARGET HWY_STATIC_TARGET
+
+//------------------------------------------------------------------------------
+// Choose targets for dynamic dispatch according to one of four policies
+
+#if 1 < (defined(HWY_COMPILE_ONLY_SCALAR) + defined(HWY_COMPILE_ONLY_EMU128) + \
+         defined(HWY_COMPILE_ONLY_STATIC))
+#error "Can only define one of HWY_COMPILE_ONLY_{SCALAR|EMU128|STATIC} - bug?"
+#endif
+// Defining one of HWY_COMPILE_ONLY_* will trump HWY_COMPILE_ALL_ATTAINABLE.
+
+// Clang, GCC and MSVC allow runtime dispatch on x86.
+#if HWY_ARCH_X86
+#define HWY_HAVE_RUNTIME_DISPATCH 1
+// On Arm, currently only GCC does, and we require Linux to detect CPU
+// capabilities.
+#elif HWY_ARCH_ARM && HWY_COMPILER_GCC_ACTUAL && HWY_OS_LINUX
+#define HWY_HAVE_RUNTIME_DISPATCH 1
+#else
+#define HWY_HAVE_RUNTIME_DISPATCH 0
+#endif
+
+// AVX3_DL is not widely available yet. To reduce code size and compile time,
+// only include it in the set of attainable targets (for dynamic dispatch) if
+// the user opts in, OR it is in the baseline (we check whether enabled below).
+#if defined(HWY_WANT_AVX3_DL) || (HWY_BASELINE & HWY_AVX3_DL)
+#define HWY_ATTAINABLE_AVX3_DL HWY_AVX3_DL
+#else
+#define HWY_ATTAINABLE_AVX3_DL 0
+#endif
+
+#if HWY_ARCH_ARM_A64 && (HWY_HAVE_RUNTIME_DISPATCH || \
+                         (HWY_ENABLED_BASELINE & (HWY_SVE | HWY_SVE_256)))
+#define HWY_ATTAINABLE_SVE HWY_ENABLED(HWY_SVE | HWY_SVE_256)
+#else
+#define HWY_ATTAINABLE_SVE 0
+#endif
+
+#if HWY_ARCH_ARM_A64 && (HWY_HAVE_RUNTIME_DISPATCH || \
+                         (HWY_ENABLED_BASELINE & (HWY_SVE2 | HWY_SVE2_128)))
+#define HWY_ATTAINABLE_SVE2 HWY_ENABLED(HWY_SVE2 | HWY_SVE2_128)
+#else
+#define HWY_ATTAINABLE_SVE2 0
+#endif
+
+// Attainable means enabled and the compiler allows intrinsics (even when not
+// allowed to autovectorize). Used in 3 and 4.
+#if HWY_ARCH_X86
+#define HWY_ATTAINABLE_TARGETS \
+  HWY_ENABLED(HWY_BASELINE_SCALAR | HWY_SSSE3 | HWY_SSE4 | HWY_AVX2 | \
+              HWY_AVX3 | HWY_ATTAINABLE_AVX3_DL)
+#elif HWY_ARCH_ARM && HWY_HAVE_RUNTIME_DISPATCH
+#define HWY_ATTAINABLE_TARGETS \
+  HWY_ENABLED(HWY_BASELINE_SCALAR | HWY_NEON | HWY_ATTAINABLE_SVE | \
+              HWY_ATTAINABLE_SVE2)
+#else
+#define HWY_ATTAINABLE_TARGETS \
+  (HWY_ENABLED_BASELINE | HWY_ATTAINABLE_SVE | HWY_ATTAINABLE_SVE2)
+#endif
+
+// 1) For older compilers: avoid SIMD intrinsics, but still support all ops.
+#if defined(HWY_COMPILE_ONLY_EMU128) && !HWY_BROKEN_EMU128
+#undef HWY_STATIC_TARGET
+#define HWY_STATIC_TARGET HWY_EMU128  // override baseline
+#define HWY_TARGETS HWY_EMU128
+
+// 1b) HWY_SCALAR is less capable than HWY_EMU128 (which supports all ops), but
+// we currently still support it for backwards compatibility.
+#elif defined(HWY_COMPILE_ONLY_SCALAR) || \
+    (defined(HWY_COMPILE_ONLY_EMU128) && HWY_BROKEN_EMU128)
+#undef HWY_STATIC_TARGET
+#define HWY_STATIC_TARGET HWY_SCALAR  // override baseline
+#define HWY_TARGETS HWY_SCALAR
+
+// 2) For forcing static dispatch without code changes (removing HWY_EXPORT)
+#elif defined(HWY_COMPILE_ONLY_STATIC)
+#define HWY_TARGETS HWY_STATIC_TARGET
+
+// 3) For tests: include all attainable targets (in particular: scalar)
+#elif defined(HWY_COMPILE_ALL_ATTAINABLE) || defined(HWY_IS_TEST)
+#define HWY_TARGETS HWY_ATTAINABLE_TARGETS
+
+// 4) Default: attainable WITHOUT non-best baseline. This reduces code size by
+// excluding superseded targets, in particular scalar. Note: HWY_STATIC_TARGET
+// may be 2^62 (HWY_SCALAR), so we must not left-shift/add it. Subtracting one
+// sets all lower bits (better targets), then we also include the static
+// target.
+#else
+#define HWY_TARGETS \
+  (HWY_ATTAINABLE_TARGETS & ((HWY_STATIC_TARGET - 1LL) | HWY_STATIC_TARGET))
+
+#endif  // target policy
+
+// HWY_ONCE and the multiple-inclusion mechanism rely on HWY_STATIC_TARGET
+// being one of the dynamic targets. This also implies HWY_TARGETS != 0 and
+// (HWY_TARGETS & HWY_ENABLED_BASELINE) != 0.
+#if (HWY_TARGETS & HWY_STATIC_TARGET) == 0
+#error "Logic error: best baseline should be included in dynamic targets"
+#endif
+
+#endif  // HIGHWAY_HWY_DETECT_TARGETS_H_
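Worked example for the default policy-4 mask, with illustrative values: subtracting 1 from the static target sets every lower (better) bit, so the mask keeps exactly the attainable targets that are at least as good as the baseline.

    // With HWY_STATIC_TARGET == HWY_AVX2 (bit 9):
    constexpr long long kStatic = 1LL << 9;
    // Bits 0..8 (better targets such as HWY_AVX3 and HWY_AVX3_DL, if
    // attainable) plus bit 9 itself are retained; higher bits are masked out.
    static_assert(((kStatic - 1LL) | kStatic) == 0x3FF, "bits 0..9 are kept");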
diff --git a/hwy/examples/benchmark.cc b/hwy/examples/benchmark.cc
new file mode 100644
index 0000000..8ab8108
--- /dev/null
+++ b/hwy/examples/benchmark.cc
@@ -0,0 +1,254 @@
+// Copyright 2019 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef __STDC_FORMAT_MACROS
+#define __STDC_FORMAT_MACROS  // before inttypes.h
+#endif
+#include <inttypes.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>
+
+#include <cmath>
+#include <numeric>  // iota
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "hwy/examples/benchmark.cc"
+#include "hwy/foreach_target.h"  // IWYU pragma: keep
+
+// Must come after foreach_target.h to avoid redefinition errors.
+#include "hwy/aligned_allocator.h"
+#include "hwy/highway.h"
+#include "hwy/nanobenchmark.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+// These templates are not found via ADL.
+#if HWY_TARGET != HWY_SCALAR
+using hwy::HWY_NAMESPACE::CombineShiftRightLanes;
+#endif
+
+class TwoArray {
+ public:
+  // Must be a multiple of the vector lane count * 8.
+  static size_t NumItems() { return 3456; }
+
+  TwoArray()
+      : a_(AllocateAligned<float>(NumItems() * 2)), b_(a_.get() + NumItems()) {
+    // = 1, but compiler doesn't know
+    const float init = static_cast<float>(Unpredictable1());
+    std::iota(a_.get(), a_.get() + NumItems(), init);
+    std::iota(b_, b_ + NumItems(), init);
+  }
+
+ protected:
+  AlignedFreeUniquePtr<float[]> a_;
+  float* b_;
+};
+
+// Measures durations, verifies results, prints timings.
+template <class Benchmark>
+void RunBenchmark(const char* caption) {
+  printf("%10s: ", caption);
+  const size_t kNumInputs = 1;
+  const size_t num_items = Benchmark::NumItems() * size_t(Unpredictable1());
+  const FuncInput inputs[kNumInputs] = {num_items};
+  Result results[kNumInputs];
+
+  Benchmark benchmark;
+
+  Params p;
+  p.verbose = false;
+  p.max_evals = 7;
+  p.target_rel_mad = 0.002;
+  const size_t num_results = MeasureClosure(
+      [&benchmark](const FuncInput input) { return benchmark(input); }, inputs,
+      kNumInputs, results, p);
+  if (num_results != kNumInputs) {
+    fprintf(stderr, "MeasureClosure failed.\n");
+  }
+
+  benchmark.Verify(num_items);
+
+  for (size_t i = 0; i < num_results; ++i) {
+    const double cycles_per_item =
+        results[i].ticks / static_cast<double>(results[i].input);
+    const double mad = results[i].variability * cycles_per_item;
+    printf("%6" PRIu64 ": %6.3f (+/- %5.3f)\n",
+           static_cast<uint64_t>(results[i].input), cycles_per_item, mad);
+  }
+}
+
+void Intro() {
+  const float in[16] = {1, 2, 3, 4, 5, 6};
+  float out[16];
+  const ScalableTag<float> d;  // largest possible vector
+  for (size_t i = 0; i < 16; i += Lanes(d)) {
+    const auto vec = LoadU(d, in + i);  // no alignment requirement
+    auto result = Mul(vec, vec);
+    result = Add(result, result);  // can update if not const
+    StoreU(result, d, out + i);
+  }
+  printf("\nF(x)->2*x^2, F(%.0f) = %.1f\n", in[2], out[2]);
+}
+
+// BEGINNER: dot product
+// 0.4 cyc/float = bronze, 0.25 = silver, 0.15 = gold!
+class BenchmarkDot : public TwoArray {
+ public:
+  BenchmarkDot() : dot_{-1.0f} {}
+
+  FuncOutput operator()(const size_t num_items) {
+    const ScalableTag<float> d;
+    const size_t N = Lanes(d);
+    using V = decltype(Zero(d));
+    // Compiler doesn't make independent sum* accumulators, so unroll manually.
+    // We cannot use an array because V might be a sizeless type. For
+    // reasonable code, we unroll 4x, but 8x might help (2 FMA ports * 4 cycle
+    // latency).
+    V sum0 = Zero(d);
+    V sum1 = Zero(d);
+    V sum2 = Zero(d);
+    V sum3 = Zero(d);
+    const float* const HWY_RESTRICT pa = &a_[0];
+    const float* const HWY_RESTRICT pb = b_;
+    for (size_t i = 0; i < num_items; i += 4 * N) {
+      const auto a0 = Load(d, pa + i + 0 * N);
+      const auto b0 = Load(d, pb + i + 0 * N);
+      sum0 = MulAdd(a0, b0, sum0);
+      const auto a1 = Load(d, pa + i + 1 * N);
+      const auto b1 = Load(d, pb + i + 1 * N);
+      sum1 = MulAdd(a1, b1, sum1);
+      const auto a2 = Load(d, pa + i + 2 * N);
+      const auto b2 = Load(d, pb + i + 2 * N);
+      sum2 = MulAdd(a2, b2, sum2);
+      const auto a3 = Load(d, pa + i + 3 * N);
+      const auto b3 = Load(d, pb + i + 3 * N);
+      sum3 = MulAdd(a3, b3, sum3);
+    }
+    // Reduction tree: sum of all accumulators by pairs into sum0.
+    sum0 = Add(sum0, sum1);
+    sum2 = Add(sum2, sum3);
+    sum0 = Add(sum0, sum2);
+    dot_ = GetLane(SumOfLanes(d, sum0));
+    return static_cast<FuncOutput>(dot_);
+  }
+  void Verify(size_t num_items) {
+    if (dot_ == -1.0f) {
+      fprintf(stderr, "Dot: must call Verify after benchmark");
+      abort();
+    }
+
+    const float expected =
+        std::inner_product(a_.get(), a_.get() + num_items, b_, 0.0f);
+    const float rel_err = std::abs(expected - dot_) / expected;
+    if (rel_err > 1.1E-6f) {
+      fprintf(stderr, "Dot: expected %e actual %e (%e)\n", expected, dot_,
+              rel_err);
+      abort();
+    }
+  }
+
+ private:
+  float dot_;  // for Verify
+};
+
+// INTERMEDIATE: delta coding
+// 1.0 cycles/float = bronze, 0.7 = silver, 0.4 = gold!
+struct BenchmarkDelta : public TwoArray {
+  FuncOutput operator()(const size_t num_items) const {
+#if HWY_TARGET == HWY_SCALAR
+    b_[0] = a_[0];
+    for (size_t i = 1; i < num_items; ++i) {
+      b_[i] = a_[i] - a_[i - 1];
+    }
+#elif HWY_CAP_GE256
+    // Larger vectors are split into 128-bit blocks, easiest to use the
+    // unaligned load support to shift between them.
+    const ScalableTag<float> df;
+    const size_t N = Lanes(df);
+    size_t i;
+    b_[0] = a_[0];
+    for (i = 1; i < N; ++i) {
+      b_[i] = a_[i] - a_[i - 1];
+    }
+    for (; i < num_items; i += N) {
+      const auto a = Load(df, &a_[i]);
+      const auto shifted = LoadU(df, &a_[i - 1]);
+      Store(a - shifted, df, &b_[i]);
+    }
+#else  // 128-bit
+    // Slightly better than unaligned loads
+    const HWY_CAPPED(float, 4) df;
+    const size_t N = Lanes(df);
+    size_t i;
+    b_[0] = a_[0];
+    for (i = 1; i < N; ++i) {
+      b_[i] = a_[i] - a_[i - 1];
+    }
+    auto prev = Load(df, &a_[0]);
+    for (; i < num_items; i += Lanes(df)) {
+      const auto a = Load(df, &a_[i]);
+      const auto shifted = CombineShiftRightLanes<3>(df, a, prev);
+      prev = a;
+      Store(Sub(a, shifted), df, &b_[i]);
+    }
+#endif
+    return static_cast<FuncOutput>(b_[num_items - 1]);
+  }
+
+  void Verify(size_t num_items) {
+    for (size_t i = 0; i < num_items; ++i) {
+      const float expected = (i == 0) ? a_[0] : a_[i] - a_[i - 1];
+      const float err = std::abs(expected - b_[i]);
+      if (err > 1E-6f) {
+        fprintf(stderr, "Delta: expected %e, actual %e\n", expected, b_[i]);
+      }
+    }
+  }
+};
+
+void RunBenchmarks() {
+  Intro();
+  printf("------------------------ %s\n", TargetName(HWY_TARGET));
+  RunBenchmark<BenchmarkDot>("dot");
+  RunBenchmark<BenchmarkDelta>("delta");
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace hwy {
+HWY_EXPORT(RunBenchmarks);
+
+void Run() {
+  for (int64_t target : SupportedAndGeneratedTargets()) {
+    SetSupportedTargetsForTest(target);
+    HWY_DYNAMIC_DISPATCH(RunBenchmarks)();
+  }
+  SetSupportedTargetsForTest(0);  // Reset the mask afterwards.
+}
+
+}  // namespace hwy
+
+int main(int /*argc*/, char** /*argv*/) {
+  hwy::Run();
+  return 0;
+}
+#endif  // HWY_ONCE
diff --git a/hwy/examples/skeleton-inl.h b/hwy/examples/skeleton-inl.h
new file mode 100644
index 0000000..8aec33e
--- /dev/null
+++ b/hwy/examples/skeleton-inl.h
@@ -0,0 +1,66 @@
+// Copyright 2020 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Demo of functions that might be called from multiple SIMD modules (either
+// other -inl.h files, or a .cc file between begin/end_target-inl). This is
+// optional - all SIMD code can reside in .cc files. However, this allows
+// splitting code into different files while still inlining instead of
+// requiring calling through function pointers.
+
+// Per-target include guard. This is only required when using dynamic
+// dispatch, i.e. including foreach_target.h. For static dispatch, a normal
+// include guard would be fine because the header is only compiled once.
+#if defined(HIGHWAY_HWY_EXAMPLES_SKELETON_INL_H_) == defined(HWY_TARGET_TOGGLE)
+#ifdef HIGHWAY_HWY_EXAMPLES_SKELETON_INL_H_
+#undef HIGHWAY_HWY_EXAMPLES_SKELETON_INL_H_
+#else
+#define HIGHWAY_HWY_EXAMPLES_SKELETON_INL_H_
+#endif
+
+// It is fine to #include normal or *-inl headers.
+#include <stddef.h>
+
+#include "hwy/highway.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace skeleton {
+namespace HWY_NAMESPACE {
+
+// Highway ops reside here; ADL does not find templates nor builtins.
+namespace hn = hwy::HWY_NAMESPACE;
+
+// Example of a type-agnostic (caller-specified lane type) and width-agnostic
+// (uses best available instruction set) function in a header.
+//
+// Computes x[i] = mul_array[i] * x_array[i] + add_array[i] for i < size.
+template <class D, typename T>
+HWY_MAYBE_UNUSED void MulAddLoop(const D d, const T* HWY_RESTRICT mul_array,
+                                 const T* HWY_RESTRICT add_array,
+                                 const size_t size, T* HWY_RESTRICT x_array) {
+  for (size_t i = 0; i < size; i += hn::Lanes(d)) {
+    const auto mul = hn::Load(d, mul_array + i);
+    const auto add = hn::Load(d, add_array + i);
+    auto x = hn::Load(d, x_array + i);
+    x = hn::MulAdd(mul, x, add);
+    hn::Store(x, d, x_array + i);
+  }
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace skeleton
+HWY_AFTER_NAMESPACE();
+
+#endif  // include guard
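When foreach_target.h is not in play, skeleton-inl.h behaves like a normal header, so the static target's MulAddLoop can be called directly. A sketch under that assumption (not part of the imported sources); note the arrays must be aligned and their length a multiple of Lanes(d), since the loop uses aligned Load/Store with no remainder handling:

    #include <cstddef>

    #include "hwy/examples/skeleton-inl.h"
    #include "hwy/highway.h"

    HWY_BEFORE_NAMESPACE();
    namespace skeleton {
    namespace HWY_NAMESPACE {

    void MulAddCaller(const float* HWY_RESTRICT mul,
                      const float* HWY_RESTRICT add, size_t size,
                      float* HWY_RESTRICT x) {
      const hn::ScalableTag<float> d;
      MulAddLoop(d, mul, add, size, x);  // x[i] = mul[i] * x[i] + add[i]
    }

    // NOLINTNEXTLINE(google-readability-namespace-comments)
    }  // namespace HWY_NAMESPACE
    }  // namespace skeleton
    HWY_AFTER_NAMESPACE();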
diff --git a/hwy/examples/skeleton.cc b/hwy/examples/skeleton.cc
new file mode 100644
index 0000000..2e820b6
--- /dev/null
+++ b/hwy/examples/skeleton.cc
@@ -0,0 +1,121 @@
+// Copyright 2020 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "hwy/examples/skeleton.h"
+
+#include <stdio.h>
+
+// >>>> for dynamic dispatch only, skip if you want static dispatch
+
+// First undef to prevent error when re-included.
+#undef HWY_TARGET_INCLUDE
+// For dynamic dispatch, specify the name of the current file (unfortunately
+// __FILE__ is not reliable) so that foreach_target.h can re-include it.
+#define HWY_TARGET_INCLUDE "hwy/examples/skeleton.cc"
+// Generates code for each enabled target by re-including this source file.
+#include "hwy/foreach_target.h"  // IWYU pragma: keep
+
+// <<<< end of dynamic dispatch
+
+// Must come after foreach_target.h to avoid redefinition errors.
+#include "hwy/highway.h"
+
+// Optional, can instead add HWY_ATTR to all functions.
+HWY_BEFORE_NAMESPACE();
+
+namespace skeleton {
+// This namespace name is unique per target, which allows code for multiple
+// targets to co-exist in the same translation unit. Required when using
+// dynamic dispatch, otherwise optional.
+namespace HWY_NAMESPACE {
+
+// Highway ops reside here; ADL does not find templates nor builtins.
+namespace hn = hwy::HWY_NAMESPACE;
+
+// Computes log2 by converting to a vector of floats. Compiled once per target.
+template <class DF>
+HWY_ATTR_NO_MSAN void OneFloorLog2(const DF df,
+                                   const uint8_t* HWY_RESTRICT values,
+                                   uint8_t* HWY_RESTRICT log2) {
+  // Type tags for converting to other element types (Rebind = same count).
+  const hn::RebindToSigned<DF> d32;
+  const hn::Rebind<uint8_t, DF> d8;
+
+  const auto u8 = hn::Load(d8, values);
+  const auto bits = hn::BitCast(d32, hn::ConvertTo(df, hn::PromoteTo(d32, u8)));
+  const auto exponent = hn::Sub(hn::ShiftRight<23>(bits), hn::Set(d32, 127));
+  hn::Store(hn::DemoteTo(d8, exponent), d8, log2);
+}
+
+void CodepathDemo() {
+  // Highway defaults to portability, but per-target codepaths may be selected
+  // via #if HWY_TARGET == HWY_SSE4 or by testing capability macros:
+#if HWY_HAVE_INTEGER64
+  const char* gather = "Has int64";
+#else
+  const char* gather = "No int64";
+#endif
+  printf("Target %s: %s\n", hwy::TargetName(HWY_TARGET), gather);
+}
+
+void FloorLog2(const uint8_t* HWY_RESTRICT values, size_t count,
+               uint8_t* HWY_RESTRICT log2) {
+  CodepathDemo();
+
+  const hn::ScalableTag<float> df;
+  const size_t N = hn::Lanes(df);
+  size_t i = 0;
+  for (; i + N <= count; i += N) {
+    OneFloorLog2(df, values + i, log2 + i);
+  }
+  for (; i < count; ++i) {
+    hn::CappedTag<float, 1> d1;
+    OneFloorLog2(d1, values + i, log2 + i);
+  }
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace skeleton
+HWY_AFTER_NAMESPACE();
+
+// The table of pointers to the various implementations in HWY_NAMESPACE must
+// be compiled only once (foreach_target #includes this file multiple times).
+// HWY_ONCE is true for only one of these 'compilation passes'.
+#if HWY_ONCE
+
+namespace skeleton {
+
+// This macro declares a static array used for dynamic dispatch; it resides in
+// the same outer namespace that contains FloorLog2.
+HWY_EXPORT(FloorLog2);
+
+// This function is optional and only needed in the case of exposing it in the
+// header file. Otherwise using HWY_DYNAMIC_DISPATCH(FloorLog2) in this module
+// is equivalent to inlining this function.
+HWY_DLLEXPORT void CallFloorLog2(const uint8_t* HWY_RESTRICT in,
+                                 const size_t count,
+                                 uint8_t* HWY_RESTRICT out) {
+  // This must reside outside of HWY_NAMESPACE because it references (calls
+  // the appropriate one from) the per-target implementations there.
+  // For static dispatch, use HWY_STATIC_DISPATCH.
+  return HWY_DYNAMIC_DISPATCH(FloorLog2)(in, count, out);
+}
+
+// Optional: anything to compile only once, e.g. non-SIMD implementations of
+// public functions provided by this module, can go inside #if HWY_ONCE.
+
+}  // namespace skeleton
+#endif  // HWY_ONCE
diff --git a/hwy/examples/skeleton.h b/hwy/examples/skeleton.h
new file mode 100644
index 0000000..381ac69
--- /dev/null
+++ b/hwy/examples/skeleton.h
@@ -0,0 +1,36 @@
+// Copyright 2020 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Demo interface to target-specific code in skeleton.cc
+
+// Normal header with include guard and namespace.
+#ifndef HIGHWAY_HWY_EXAMPLES_SKELETON_H_
+#define HIGHWAY_HWY_EXAMPLES_SKELETON_H_
+
+#include <stddef.h>
+
+// Platform-specific definitions used for declaring an interface, independent
+// of the SIMD instruction set.
diff --git a/hwy/examples/skeleton.h b/hwy/examples/skeleton.h
new file mode 100644
index 0000000..381ac69
--- /dev/null
+++ b/hwy/examples/skeleton.h
@@ -0,0 +1,36 @@
+// Copyright 2020 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Demo interface to target-specific code in skeleton.cc
+
+// Normal header with include guard and namespace.
+#ifndef HIGHWAY_HWY_EXAMPLES_SKELETON_H_
+#define HIGHWAY_HWY_EXAMPLES_SKELETON_H_
+
+#include <stddef.h>
+
+// Platform-specific definitions used for declaring an interface, independent
+// of the SIMD instruction set.
+#include "hwy/base.h"  // HWY_RESTRICT
+
+namespace skeleton {
+
+// Computes base-2 logarithm by converting to float. Supports dynamic dispatch.
+HWY_DLLEXPORT void CallFloorLog2(const uint8_t* HWY_RESTRICT in,
+                                 const size_t count,
+                                 uint8_t* HWY_RESTRICT out);
+
+}  // namespace skeleton
+
+#endif  // HIGHWAY_HWY_EXAMPLES_SKELETON_H_
diff --git a/hwy/examples/skeleton_test.cc b/hwy/examples/skeleton_test.cc
new file mode 100644
index 0000000..c7c26bf
--- /dev/null
+++ b/hwy/examples/skeleton_test.cc
@@ -0,0 +1,110 @@
+// Copyright 2020 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Example of unit test for the "skeleton" library.
+
+#include "hwy/examples/skeleton.h"
+
+#include <stddef.h>
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "examples/skeleton_test.cc"
+#include "hwy/foreach_target.h"  // IWYU pragma: keep
+
+// Must come after foreach_target.h to avoid redefinition errors.
+#include "hwy/highway.h"
+#include "hwy/tests/test_util-inl.h"
+
+// Optional: factor out parts of the implementation into *-inl.h
+// (must also come after foreach_target.h to avoid redefinition errors)
+#include "hwy/examples/skeleton-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace skeleton {
+namespace HWY_NAMESPACE {
+
+namespace hn = hwy::HWY_NAMESPACE;
+
+// Calls function defined in skeleton.cc.
+struct TestFloorLog2 {
+  template <class T, class DF>
+  HWY_NOINLINE void operator()(T /*unused*/, DF df) {
+    const size_t count = 5 * hn::Lanes(df);
+    auto in = hwy::AllocateAligned<uint8_t>(count);
+    auto expected = hwy::AllocateAligned<uint8_t>(count);
+
+    hwy::RandomState rng;
+    for (size_t i = 0; i < count; ++i) {
+      expected[i] = Random32(&rng) & 7;
+      in[i] = static_cast<uint8_t>(1u << expected[i]);
+    }
+    auto out = hwy::AllocateAligned<uint8_t>(count);
+    CallFloorLog2(in.get(), count, out.get());
+    int sum = 0;
+    for (size_t i = 0; i < count; ++i) {
+      HWY_ASSERT_EQ(expected[i], out[i]);
+      sum += out[i];
+    }
+    hwy::PreventElision(sum);
+  }
+};
+
+HWY_NOINLINE void TestAllFloorLog2() {
+  hn::ForPartialVectors<TestFloorLog2>()(float());
+}
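ForPartialVectors re-invokes the functor for every supported lane count, so a new test needs only a functor plus one registration line; the HighwayTest suite later in this patch verifies that all powers of two up to the full vector are visited. A minimal sketch, with MyTest as a hypothetical functor name:

  struct MyTest {
    template <class T, class D>
    HWY_NOINLINE void operator()(T /*unused*/, D d) {
      // Lanes(d) differs between invocations; avoid assuming a fixed width.
      HWY_ASSERT_VEC_EQ(d, hn::Zero(d), hn::Set(d, T{0}));
    }
  };
  HWY_NOINLINE void TestAllMyTest() {
    hn::ForAllTypes(hn::ForPartialVectors<MyTest>());
  }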
+
+// Calls function defined in skeleton-inl.h.
+struct TestSumMulAdd {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    hwy::RandomState rng;
+    const size_t count = 4096;
+    EXPECT_EQ(0, count % hn::Lanes(d));
+    auto mul = hwy::AllocateAligned<T>(count);
+    auto x = hwy::AllocateAligned<T>(count);
+    auto add = hwy::AllocateAligned<T>(count);
+    for (size_t i = 0; i < count; ++i) {
+      mul[i] = static_cast<T>(Random32(&rng) & 0xF);
+      x[i] = static_cast<T>(Random32(&rng) & 0xFF);
+      add[i] = static_cast<T>(Random32(&rng) & 0xFF);
+    }
+    double expected_sum = 0.0;
+    for (size_t i = 0; i < count; ++i) {
+      expected_sum += mul[i] * x[i] + add[i];
+    }
+
+    MulAddLoop(d, mul.get(), add.get(), count, x.get());
+    HWY_ASSERT_EQ(4344240.0, expected_sum);
+  }
+};
+
+HWY_NOINLINE void TestAllSumMulAdd() {
+  hn::ForFloatTypes(hn::ForPartialVectors<TestSumMulAdd>());
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace skeleton
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+
+namespace skeleton {
+HWY_BEFORE_TEST(SkeletonTest);
+HWY_EXPORT_AND_TEST_P(SkeletonTest, TestAllFloorLog2);
+HWY_EXPORT_AND_TEST_P(SkeletonTest, TestAllSumMulAdd);
+}  // namespace skeleton
+
+#endif
diff --git a/hwy/foreach_target.h b/hwy/foreach_target.h
new file mode 100644
index 0000000..3929905
--- /dev/null
+++ b/hwy/foreach_target.h
@@ -0,0 +1,261 @@
+// Copyright 2020 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef HIGHWAY_HWY_FOREACH_TARGET_H_
+#define HIGHWAY_HWY_FOREACH_TARGET_H_
+
+// Re-includes the translation unit zero or more times to compile for any
+// targets except HWY_STATIC_TARGET. Defines unique HWY_TARGET each time so
+// that highway.h defines the corresponding macro/namespace.
+
+#include "hwy/detect_targets.h"
+
+// *_inl.h may include other headers, which requires include guards to prevent
+// repeated inclusion. The guards must be reset after compiling each target, so
+// the header is again visible. This is done by flipping HWY_TARGET_TOGGLE,
+// defining it if undefined and vice versa. This macro is initially undefined
+// so that IDEs don't gray out the contents of each header.
+#ifdef HWY_TARGET_TOGGLE
+#error "This macro must not be defined outside foreach_target.h"
+#endif
+
+#ifdef HWY_HIGHWAY_INCLUDED  // highway.h include guard
+// Trigger fixup at the bottom of this header.
+#define HWY_ALREADY_INCLUDED
+
+// The next highway.h must re-include set_macros-inl.h because the first
+// highway.h chose the static target instead of what we will set below.
+#undef HWY_SET_MACROS_PER_TARGET
+#endif
+
+// Disable HWY_EXPORT in user code until we have generated all targets. Note
+// that a subsequent highway.h will not override this definition.
+#undef HWY_ONCE
+#define HWY_ONCE (0 || HWY_IDE)
+
+// Avoid warnings on #include HWY_TARGET_INCLUDE by hiding them from the IDE;
+// also skip if only 1 target defined (no re-inclusion will be necessary).
+#if !HWY_IDE && (HWY_TARGETS != HWY_STATIC_TARGET) + +#if !defined(HWY_TARGET_INCLUDE) +#error ">1 target enabled => define HWY_TARGET_INCLUDE before foreach_target.h" +#endif + +#if (HWY_TARGETS & HWY_EMU128) && (HWY_STATIC_TARGET != HWY_EMU128) +#undef HWY_TARGET +#define HWY_TARGET HWY_EMU128 +#include HWY_TARGET_INCLUDE +#ifdef HWY_TARGET_TOGGLE +#undef HWY_TARGET_TOGGLE +#else +#define HWY_TARGET_TOGGLE +#endif +#endif + +#if (HWY_TARGETS & HWY_SCALAR) && (HWY_STATIC_TARGET != HWY_SCALAR) +#undef HWY_TARGET +#define HWY_TARGET HWY_SCALAR +#include HWY_TARGET_INCLUDE +#ifdef HWY_TARGET_TOGGLE +#undef HWY_TARGET_TOGGLE +#else +#define HWY_TARGET_TOGGLE +#endif +#endif + +#if (HWY_TARGETS & HWY_NEON) && (HWY_STATIC_TARGET != HWY_NEON) +#undef HWY_TARGET +#define HWY_TARGET HWY_NEON +#include HWY_TARGET_INCLUDE +#ifdef HWY_TARGET_TOGGLE +#undef HWY_TARGET_TOGGLE +#else +#define HWY_TARGET_TOGGLE +#endif +#endif + +#if (HWY_TARGETS & HWY_RVV) && (HWY_STATIC_TARGET != HWY_RVV) +#undef HWY_TARGET +#define HWY_TARGET HWY_RVV +#include HWY_TARGET_INCLUDE +#ifdef HWY_TARGET_TOGGLE +#undef HWY_TARGET_TOGGLE +#else +#define HWY_TARGET_TOGGLE +#endif +#endif + +#if (HWY_TARGETS & HWY_SVE) && (HWY_STATIC_TARGET != HWY_SVE) +#undef HWY_TARGET +#define HWY_TARGET HWY_SVE +#include HWY_TARGET_INCLUDE +#ifdef HWY_TARGET_TOGGLE +#undef HWY_TARGET_TOGGLE +#else +#define HWY_TARGET_TOGGLE +#endif +#endif + +#if (HWY_TARGETS & HWY_SVE2) && (HWY_STATIC_TARGET != HWY_SVE2) +#undef HWY_TARGET +#define HWY_TARGET HWY_SVE2 +#include HWY_TARGET_INCLUDE +#ifdef HWY_TARGET_TOGGLE +#undef HWY_TARGET_TOGGLE +#else +#define HWY_TARGET_TOGGLE +#endif +#endif + +#if (HWY_TARGETS & HWY_SVE_256) && (HWY_STATIC_TARGET != HWY_SVE_256) +#undef HWY_TARGET +#define HWY_TARGET HWY_SVE_256 +#include HWY_TARGET_INCLUDE +#ifdef HWY_TARGET_TOGGLE +#undef HWY_TARGET_TOGGLE +#else +#define HWY_TARGET_TOGGLE +#endif +#endif + +#if (HWY_TARGETS & HWY_SVE2_128) && (HWY_STATIC_TARGET != HWY_SVE2_128) +#undef HWY_TARGET +#define HWY_TARGET HWY_SVE2_128 +#include HWY_TARGET_INCLUDE +#ifdef HWY_TARGET_TOGGLE +#undef HWY_TARGET_TOGGLE +#else +#define HWY_TARGET_TOGGLE +#endif +#endif + +#if (HWY_TARGETS & HWY_SSSE3) && (HWY_STATIC_TARGET != HWY_SSSE3) +#undef HWY_TARGET +#define HWY_TARGET HWY_SSSE3 +#include HWY_TARGET_INCLUDE +#ifdef HWY_TARGET_TOGGLE +#undef HWY_TARGET_TOGGLE +#else +#define HWY_TARGET_TOGGLE +#endif +#endif + +#if (HWY_TARGETS & HWY_SSE4) && (HWY_STATIC_TARGET != HWY_SSE4) +#undef HWY_TARGET +#define HWY_TARGET HWY_SSE4 +#include HWY_TARGET_INCLUDE +#ifdef HWY_TARGET_TOGGLE +#undef HWY_TARGET_TOGGLE +#else +#define HWY_TARGET_TOGGLE +#endif +#endif + +#if (HWY_TARGETS & HWY_AVX2) && (HWY_STATIC_TARGET != HWY_AVX2) +#undef HWY_TARGET +#define HWY_TARGET HWY_AVX2 +#include HWY_TARGET_INCLUDE +#ifdef HWY_TARGET_TOGGLE +#undef HWY_TARGET_TOGGLE +#else +#define HWY_TARGET_TOGGLE +#endif +#endif + +#if (HWY_TARGETS & HWY_AVX3) && (HWY_STATIC_TARGET != HWY_AVX3) +#undef HWY_TARGET +#define HWY_TARGET HWY_AVX3 +#include HWY_TARGET_INCLUDE +#ifdef HWY_TARGET_TOGGLE +#undef HWY_TARGET_TOGGLE +#else +#define HWY_TARGET_TOGGLE +#endif +#endif + +#if (HWY_TARGETS & HWY_AVX3_DL) && (HWY_STATIC_TARGET != HWY_AVX3_DL) +#undef HWY_TARGET +#define HWY_TARGET HWY_AVX3_DL +#include HWY_TARGET_INCLUDE +#ifdef HWY_TARGET_TOGGLE +#undef HWY_TARGET_TOGGLE +#else +#define HWY_TARGET_TOGGLE +#endif +#endif + +#if (HWY_TARGETS & HWY_WASM_EMU256) && (HWY_STATIC_TARGET != HWY_WASM_EMU256) +#undef HWY_TARGET +#define HWY_TARGET HWY_WASM_EMU256 
+#include HWY_TARGET_INCLUDE
+#ifdef HWY_TARGET_TOGGLE
+#undef HWY_TARGET_TOGGLE
+#else
+#define HWY_TARGET_TOGGLE
+#endif
+#endif
+
+#if (HWY_TARGETS & HWY_WASM) && (HWY_STATIC_TARGET != HWY_WASM)
+#undef HWY_TARGET
+#define HWY_TARGET HWY_WASM
+#include HWY_TARGET_INCLUDE
+#ifdef HWY_TARGET_TOGGLE
+#undef HWY_TARGET_TOGGLE
+#else
+#define HWY_TARGET_TOGGLE
+#endif
+#endif
+
+#if (HWY_TARGETS & HWY_PPC8) && (HWY_STATIC_TARGET != HWY_PPC8)
+#undef HWY_TARGET
+#define HWY_TARGET HWY_PPC8
+#include HWY_TARGET_INCLUDE
+#ifdef HWY_TARGET_TOGGLE
+#undef HWY_TARGET_TOGGLE
+#else
+#define HWY_TARGET_TOGGLE
+#endif
+#endif
+
+#endif  // !HWY_IDE && (HWY_TARGETS != HWY_STATIC_TARGET)
+
+// Now that all but the static target have been generated, re-enable
+// HWY_EXPORT.
+#undef HWY_ONCE
+#define HWY_ONCE 1
+
+// If we re-include once per enabled target, the translation unit's
+// implementation would have to be skipped via #if to avoid redefining symbols.
+// We instead skip the re-include for HWY_STATIC_TARGET, and generate its
+// implementation when resuming compilation of the translation unit.
+#undef HWY_TARGET
+#define HWY_TARGET HWY_STATIC_TARGET
+
+#ifdef HWY_ALREADY_INCLUDED
+// Revert the previous toggle to prevent redefinitions for the static target.
+#ifdef HWY_TARGET_TOGGLE
+#undef HWY_TARGET_TOGGLE
+#else
+#define HWY_TARGET_TOGGLE
+#endif
+
+// Force re-inclusion of set_macros-inl.h now that HWY_TARGET is restored.
+#ifdef HWY_SET_MACROS_PER_TARGET
+#undef HWY_SET_MACROS_PER_TARGET
+#else
+#define HWY_SET_MACROS_PER_TARGET
+#endif
+#endif
+
+#endif  // HIGHWAY_HWY_FOREACH_TARGET_H_
diff --git a/hwy/highway.h b/hwy/highway.h
new file mode 100644
index 0000000..4640f31
--- /dev/null
+++ b/hwy/highway.h
@@ -0,0 +1,378 @@
+// Copyright 2020 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// This include guard is checked by foreach_target, so avoid the usual _H_
+// suffix to prevent copybara from renaming it. NOTE: ops/*-inl.h are included
+// after/outside this include guard.
+#ifndef HWY_HIGHWAY_INCLUDED
+#define HWY_HIGHWAY_INCLUDED
+
+// Main header required before using vector types.
+
+#include "hwy/base.h"
+#include "hwy/targets.h"
+
+namespace hwy {
+
+// API version (https://semver.org/); keep in sync with CMakeLists.txt.
+#define HWY_MAJOR 1
+#define HWY_MINOR 0
+#define HWY_PATCH 2
+
+//------------------------------------------------------------------------------
+// Shorthand for tags (defined in shared-inl.h) used to select overloads.
+// Note that ScalableTag is preferred over HWY_FULL, and CappedTag over
+// HWY_CAPPED(T, N).
+
+// HWY_FULL(T[,LMUL=1]) is a native vector/group. LMUL is the number of
+// registers in the group, and is ignored on targets that do not support
+// groups.
+#define HWY_FULL1(T) hwy::HWY_NAMESPACE::ScalableTag<T>
+#define HWY_FULL2(T, LMUL) \
+  hwy::HWY_NAMESPACE::ScalableTag<T, hwy::CeilLog2(HWY_MAX(1, LMUL))>
+#define HWY_3TH_ARG(arg1, arg2, arg3, ...) arg3
+// Workaround for MSVC grouping __VA_ARGS__ into a single argument
+#define HWY_FULL_RECOMPOSER(args_with_paren) HWY_3TH_ARG args_with_paren
+// Trailing comma avoids -pedantic false alarm
+#define HWY_CHOOSE_FULL(...) \
+  HWY_FULL_RECOMPOSER((__VA_ARGS__, HWY_FULL2, HWY_FULL1, ))
+#define HWY_FULL(...) HWY_CHOOSE_FULL(__VA_ARGS__())(__VA_ARGS__)
+
+// Vector of up to MAX_N lanes. It's better to use full vectors where possible.
+#define HWY_CAPPED(T, MAX_N) hwy::HWY_NAMESPACE::CappedTag<T, MAX_N>
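The macros above are shorthand for tag types that can also be spelled directly; per the comment, the direct spellings are preferred in new code. Equivalences (variable names are arbitrary):

  // HWY_FULL(float) declares the same tag type as:
  hwy::HWY_NAMESPACE::ScalableTag<float> df;
  // HWY_CAPPED(uint8_t, 4) declares the same tag type as:
  hwy::HWY_NAMESPACE::CappedTag<uint8_t, 4> d4;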
+
+//------------------------------------------------------------------------------
+// Export user functions for static/dynamic dispatch
+
+// Evaluates to 0 inside a translation unit if it is generating anything but
+// the static target (the last one if multiple targets are enabled). Used to
+// prevent redefinitions of HWY_EXPORT. Unless foreach_target.h is included,
+// we only compile once anyway, so this is 1 unless it is or has been included.
+#ifndef HWY_ONCE
+#define HWY_ONCE 1
+#endif
+
+// HWY_STATIC_DISPATCH(FUNC_NAME) is the namespace-qualified FUNC_NAME for
+// HWY_STATIC_TARGET (the only defined namespace unless HWY_TARGET_INCLUDE is
+// defined), and can be used to deduce the return type of Choose*.
+#if HWY_STATIC_TARGET == HWY_SCALAR
+#define HWY_STATIC_DISPATCH(FUNC_NAME) N_SCALAR::FUNC_NAME
+#elif HWY_STATIC_TARGET == HWY_EMU128
+#define HWY_STATIC_DISPATCH(FUNC_NAME) N_EMU128::FUNC_NAME
+#elif HWY_STATIC_TARGET == HWY_RVV
+#define HWY_STATIC_DISPATCH(FUNC_NAME) N_RVV::FUNC_NAME
+#elif HWY_STATIC_TARGET == HWY_WASM_EMU256
+#define HWY_STATIC_DISPATCH(FUNC_NAME) N_WASM_EMU256::FUNC_NAME
+#elif HWY_STATIC_TARGET == HWY_WASM
+#define HWY_STATIC_DISPATCH(FUNC_NAME) N_WASM::FUNC_NAME
+#elif HWY_STATIC_TARGET == HWY_NEON
+#define HWY_STATIC_DISPATCH(FUNC_NAME) N_NEON::FUNC_NAME
+#elif HWY_STATIC_TARGET == HWY_SVE
+#define HWY_STATIC_DISPATCH(FUNC_NAME) N_SVE::FUNC_NAME
+#elif HWY_STATIC_TARGET == HWY_SVE2
+#define HWY_STATIC_DISPATCH(FUNC_NAME) N_SVE2::FUNC_NAME
+#elif HWY_STATIC_TARGET == HWY_SVE_256
+#define HWY_STATIC_DISPATCH(FUNC_NAME) N_SVE_256::FUNC_NAME
+#elif HWY_STATIC_TARGET == HWY_SVE2_128
+#define HWY_STATIC_DISPATCH(FUNC_NAME) N_SVE2_128::FUNC_NAME
+#elif HWY_STATIC_TARGET == HWY_PPC8
+#define HWY_STATIC_DISPATCH(FUNC_NAME) N_PPC8::FUNC_NAME
+#elif HWY_STATIC_TARGET == HWY_SSSE3
+#define HWY_STATIC_DISPATCH(FUNC_NAME) N_SSSE3::FUNC_NAME
+#elif HWY_STATIC_TARGET == HWY_SSE4
+#define HWY_STATIC_DISPATCH(FUNC_NAME) N_SSE4::FUNC_NAME
+#elif HWY_STATIC_TARGET == HWY_AVX2
+#define HWY_STATIC_DISPATCH(FUNC_NAME) N_AVX2::FUNC_NAME
+#elif HWY_STATIC_TARGET == HWY_AVX3
+#define HWY_STATIC_DISPATCH(FUNC_NAME) N_AVX3::FUNC_NAME
+#elif HWY_STATIC_TARGET == HWY_AVX3_DL
+#define HWY_STATIC_DISPATCH(FUNC_NAME) N_AVX3_DL::FUNC_NAME
+#endif
+
+// HWY_CHOOSE_*(FUNC_NAME) expands to the function pointer for that target, or
+// nullptr if that target was not compiled.
+#if HWY_TARGETS & HWY_EMU128
+#define HWY_CHOOSE_FALLBACK(FUNC_NAME) &N_EMU128::FUNC_NAME
+#elif HWY_TARGETS & HWY_SCALAR
+#define HWY_CHOOSE_FALLBACK(FUNC_NAME) &N_SCALAR::FUNC_NAME
+#else
+// When HWY_SCALAR/HWY_EMU128 are not present and other targets were disabled
+// at runtime, fall back to the baseline with HWY_STATIC_DISPATCH().
+#define HWY_CHOOSE_FALLBACK(FUNC_NAME) &HWY_STATIC_DISPATCH(FUNC_NAME)
+#endif
+
+#if HWY_TARGETS & HWY_WASM_EMU256
+#define HWY_CHOOSE_WASM_EMU256(FUNC_NAME) &N_WASM_EMU256::FUNC_NAME
+#else
+#define HWY_CHOOSE_WASM_EMU256(FUNC_NAME) nullptr
+#endif
+
+#if HWY_TARGETS & HWY_WASM
+#define HWY_CHOOSE_WASM(FUNC_NAME) &N_WASM::FUNC_NAME
+#else
+#define HWY_CHOOSE_WASM(FUNC_NAME) nullptr
+#endif
+
+#if HWY_TARGETS & HWY_RVV
+#define HWY_CHOOSE_RVV(FUNC_NAME) &N_RVV::FUNC_NAME
+#else
+#define HWY_CHOOSE_RVV(FUNC_NAME) nullptr
+#endif
+
+#if HWY_TARGETS & HWY_NEON
+#define HWY_CHOOSE_NEON(FUNC_NAME) &N_NEON::FUNC_NAME
+#else
+#define HWY_CHOOSE_NEON(FUNC_NAME) nullptr
+#endif
+
+#if HWY_TARGETS & HWY_SVE
+#define HWY_CHOOSE_SVE(FUNC_NAME) &N_SVE::FUNC_NAME
+#else
+#define HWY_CHOOSE_SVE(FUNC_NAME) nullptr
+#endif
+
+#if HWY_TARGETS & HWY_SVE2
+#define HWY_CHOOSE_SVE2(FUNC_NAME) &N_SVE2::FUNC_NAME
+#else
+#define HWY_CHOOSE_SVE2(FUNC_NAME) nullptr
+#endif
+
+#if HWY_TARGETS & HWY_SVE_256
+#define HWY_CHOOSE_SVE_256(FUNC_NAME) &N_SVE_256::FUNC_NAME
+#else
+#define HWY_CHOOSE_SVE_256(FUNC_NAME) nullptr
+#endif
+
+#if HWY_TARGETS & HWY_SVE2_128
+#define HWY_CHOOSE_SVE2_128(FUNC_NAME) &N_SVE2_128::FUNC_NAME
+#else
+#define HWY_CHOOSE_SVE2_128(FUNC_NAME) nullptr
+#endif
+
+#if HWY_TARGETS & HWY_PPC8
+#define HWY_CHOOSE_PPC8(FUNC_NAME) &N_PPC8::FUNC_NAME
+#else
+#define HWY_CHOOSE_PPC8(FUNC_NAME) nullptr
+#endif
+
+#if HWY_TARGETS & HWY_SSSE3
+#define HWY_CHOOSE_SSSE3(FUNC_NAME) &N_SSSE3::FUNC_NAME
+#else
+#define HWY_CHOOSE_SSSE3(FUNC_NAME) nullptr
+#endif
+
+#if HWY_TARGETS & HWY_SSE4
+#define HWY_CHOOSE_SSE4(FUNC_NAME) &N_SSE4::FUNC_NAME
+#else
+#define HWY_CHOOSE_SSE4(FUNC_NAME) nullptr
+#endif
+
+#if HWY_TARGETS & HWY_AVX2
+#define HWY_CHOOSE_AVX2(FUNC_NAME) &N_AVX2::FUNC_NAME
+#else
+#define HWY_CHOOSE_AVX2(FUNC_NAME) nullptr
+#endif
+
+#if HWY_TARGETS & HWY_AVX3
+#define HWY_CHOOSE_AVX3(FUNC_NAME) &N_AVX3::FUNC_NAME
+#else
+#define HWY_CHOOSE_AVX3(FUNC_NAME) nullptr
+#endif
+
+#if HWY_TARGETS & HWY_AVX3_DL
+#define HWY_CHOOSE_AVX3_DL(FUNC_NAME) &N_AVX3_DL::FUNC_NAME
+#else
+#define HWY_CHOOSE_AVX3_DL(FUNC_NAME) nullptr
+#endif
+
+// MSVC 2017 workaround: the non-type template parameter to ChooseAndCall
+// apparently cannot be an array. Use a function pointer instead, which has the
+// disadvantage that we call the static (not best) target on the first call to
+// any HWY_DYNAMIC_DISPATCH.
+#if HWY_COMPILER_MSVC && HWY_COMPILER_MSVC < 1915
+#define HWY_DISPATCH_WORKAROUND 1
+#else
+#define HWY_DISPATCH_WORKAROUND 0
+#endif
+
+// Provides a static member function which is what is called during the first
+// HWY_DYNAMIC_DISPATCH, where GetIndex is still zero, and instantiations of
+// this function are the first entry in the tables created by HWY_EXPORT.
+template <typename RetType, typename... Args>
+struct FunctionCache {
+ public:
+  typedef RetType(FunctionType)(Args...);
+
+#if HWY_DISPATCH_WORKAROUND
+  template <FunctionType* const func>
+  static RetType ChooseAndCall(Args... args) {
+    ChosenTarget& chosen_target = GetChosenTarget();
+    chosen_target.Update(SupportedTargets());
+    return (*func)(args...);
+  }
+#else
+  // A template function that when instantiated has the same signature as the
+  // function being called. This function initializes the bit array of targets
+  // supported by the current CPU and then calls the appropriate entry within
+  // the HWY_EXPORT table. Subsequent calls via HWY_DYNAMIC_DISPATCH to any
+  // exported functions, even those defined by different translation units,
+  // will dispatch directly to the best available target.
+  template <FunctionType* const table[]>
+  static RetType ChooseAndCall(Args... args) {
+    ChosenTarget& chosen_target = GetChosenTarget();
+    chosen_target.Update(SupportedTargets());
+    return (table[chosen_target.GetIndex()])(args...);
+  }
+#endif  // HWY_DISPATCH_WORKAROUND
+};
+
+// Used to deduce the template parameters RetType and Args from a function.
+template <typename RetType, typename... Args>
+FunctionCache<RetType, Args...> DeduceFunctionCache(RetType (*)(Args...)) {
+  return FunctionCache<RetType, Args...>();
+}
+
+#define HWY_DISPATCH_TABLE(FUNC_NAME) \
+  HWY_CONCAT(FUNC_NAME, HighwayDispatchTable)
+
+// HWY_EXPORT(FUNC_NAME); expands to a static array that is used by
+// HWY_DYNAMIC_DISPATCH() to call the appropriate function at runtime. This
+// static array must be defined at the same namespace level as the function
+// it is exporting.
+// After being exported, it can be called from other parts of the same source
+// file using HWY_DYNAMIC_DISPATCH(), in particular from a function wrapper
+// like in the following example:
+//
+// #include "hwy/highway.h"
+// HWY_BEFORE_NAMESPACE();
+// namespace skeleton {
+// namespace HWY_NAMESPACE {
+//
+// void MyFunction(int a, char b, const char* c) { ... }
+//
+// // NOLINTNEXTLINE(google-readability-namespace-comments)
+// }  // namespace HWY_NAMESPACE
+// }  // namespace skeleton
+// HWY_AFTER_NAMESPACE();
+//
+// namespace skeleton {
+// HWY_EXPORT(MyFunction);  // Defines the dispatch table in this scope.
+//
+// void MyFunction(int a, char b, const char* c) {
+//   return HWY_DYNAMIC_DISPATCH(MyFunction)(a, b, c);
+// }
+// }  // namespace skeleton
+//
+
+#if HWY_IDE || ((HWY_TARGETS & (HWY_TARGETS - 1)) == 0)
+
+// Simplified version for IDE or the dynamic dispatch case with only one
+// target. This case still uses a table, although of a single element, to
+// provide the same compile error conditions as with the dynamic dispatch case
+// when multiple targets are being compiled.
+#define HWY_EXPORT(FUNC_NAME)                                              \
+  HWY_MAYBE_UNUSED static decltype(&HWY_STATIC_DISPATCH(FUNC_NAME)) const \
+      HWY_DISPATCH_TABLE(FUNC_NAME)[1] = {&HWY_STATIC_DISPATCH(FUNC_NAME)}
+#define HWY_DYNAMIC_DISPATCH(FUNC_NAME) HWY_STATIC_DISPATCH(FUNC_NAME)
+
+#else
+
+// Simplified version for MSVC 2017: function pointer instead of table.
+#if HWY_DISPATCH_WORKAROUND
+
+#define HWY_EXPORT(FUNC_NAME)                                                 \
+  static decltype(&HWY_STATIC_DISPATCH(FUNC_NAME)) const HWY_DISPATCH_TABLE( \
+      FUNC_NAME)[HWY_MAX_DYNAMIC_TARGETS + 2] = {                             \
+      /* The first entry in the table initializes the global cache and       \
+       * calls the function from HWY_STATIC_TARGET. */                        \
+      &decltype(hwy::DeduceFunctionCache(&HWY_STATIC_DISPATCH(               \
+          FUNC_NAME)))::ChooseAndCall<&HWY_STATIC_DISPATCH(FUNC_NAME)>,       \
+      HWY_CHOOSE_TARGET_LIST(FUNC_NAME),                                      \
+      HWY_CHOOSE_FALLBACK(FUNC_NAME),                                         \
+  }
+
+#else
+
+// Dynamic dispatch case with one entry per dynamic target plus the fallback
+// target and the initialization wrapper.
+#define HWY_EXPORT(FUNC_NAME)                                                 \
+  static decltype(&HWY_STATIC_DISPATCH(FUNC_NAME)) const HWY_DISPATCH_TABLE( \
+      FUNC_NAME)[HWY_MAX_DYNAMIC_TARGETS + 2] = {                             \
+      /* The first entry in the table initializes the global cache and       \
+       * calls the appropriate function. */                                   \
+      &decltype(hwy::DeduceFunctionCache(&HWY_STATIC_DISPATCH(               \
+          FUNC_NAME)))::ChooseAndCall<HWY_DISPATCH_TABLE(FUNC_NAME)>,         \
+      HWY_CHOOSE_TARGET_LIST(FUNC_NAME),                                      \
+      HWY_CHOOSE_FALLBACK(FUNC_NAME),                                         \
+  }
+
+#endif  // HWY_DISPATCH_WORKAROUND
+
+#define HWY_DYNAMIC_DISPATCH(FUNC_NAME) \
+  (*(HWY_DISPATCH_TABLE(FUNC_NAME)[hwy::GetChosenTarget().GetIndex()]))
+
+#endif  // HWY_IDE || ((HWY_TARGETS & (HWY_TARGETS - 1)) == 0)
+
+// DEPRECATED names; please use HWY_HAVE_* instead.
+#define HWY_CAP_INTEGER64 HWY_HAVE_INTEGER64
+#define HWY_CAP_FLOAT16 HWY_HAVE_FLOAT16
+#define HWY_CAP_FLOAT64 HWY_HAVE_FLOAT64
+
+}  // namespace hwy
+
+#endif  // HWY_HIGHWAY_INCLUDED
+
+//------------------------------------------------------------------------------
+
+// NOTE: the following definitions and ops/*.h depend on HWY_TARGET, so we want
+// to include them once per target, which is ensured by the toggle check.
+// Because ops/*.h are included under it, they do not need their own guard.
+#if defined(HWY_HIGHWAY_PER_TARGET) == defined(HWY_TARGET_TOGGLE)
+#ifdef HWY_HIGHWAY_PER_TARGET
+#undef HWY_HIGHWAY_PER_TARGET
+#else
+#define HWY_HIGHWAY_PER_TARGET
+#endif
+
+// These define ops inside namespace hwy::HWY_NAMESPACE.
+#if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
+#include "hwy/ops/x86_128-inl.h"
+#elif HWY_TARGET == HWY_AVX2
+#include "hwy/ops/x86_256-inl.h"
+#elif HWY_TARGET == HWY_AVX3 || HWY_TARGET == HWY_AVX3_DL
+#include "hwy/ops/x86_512-inl.h"
+#elif HWY_TARGET == HWY_PPC8
+#error "PPC is not yet supported"
+#elif HWY_TARGET == HWY_NEON
+#include "hwy/ops/arm_neon-inl.h"
+#elif HWY_TARGET == HWY_SVE || HWY_TARGET == HWY_SVE2 || \
+    HWY_TARGET == HWY_SVE_256 || HWY_TARGET == HWY_SVE2_128
+#include "hwy/ops/arm_sve-inl.h"
+#elif HWY_TARGET == HWY_WASM_EMU256
+#include "hwy/ops/wasm_256-inl.h"
+#elif HWY_TARGET == HWY_WASM
+#include "hwy/ops/wasm_128-inl.h"
+#elif HWY_TARGET == HWY_RVV
+#include "hwy/ops/rvv-inl.h"
+#elif HWY_TARGET == HWY_EMU128
+#include "hwy/ops/emu128-inl.h"
+#elif HWY_TARGET == HWY_SCALAR
+#include "hwy/ops/scalar-inl.h"
+#else
+#pragma message("HWY_TARGET does not match any known target")
+#endif  // HWY_TARGET
+
+#include "hwy/ops/generic_ops-inl.h"
+
+#endif  // HWY_HIGHWAY_PER_TARGET
diff --git a/hwy/highway_export.h b/hwy/highway_export.h
new file mode 100644
index 0000000..30edc17
--- /dev/null
+++ b/hwy/highway_export.h
@@ -0,0 +1,74 @@
+// Pseudo-generated file to handle both cmake & bazel build system.
+ +// Initial generation done using cmake code: +// include(GenerateExportHeader) +// generate_export_header(hwy EXPORT_MACRO_NAME HWY_DLLEXPORT EXPORT_FILE_NAME +// hwy/highway_export.h) +// code reformatted using clang-format --style=Google + +#ifndef HWY_DLLEXPORT_H +#define HWY_DLLEXPORT_H + +#if !defined(HWY_SHARED_DEFINE) +#define HWY_DLLEXPORT +#define HWY_CONTRIB_DLLEXPORT +#define HWY_TEST_DLLEXPORT +#else // !HWY_SHARED_DEFINE + +#ifndef HWY_DLLEXPORT +#if defined(hwy_EXPORTS) +/* We are building this library */ +#ifdef _WIN32 +#define HWY_DLLEXPORT __declspec(dllexport) +#else +#define HWY_DLLEXPORT __attribute__((visibility("default"))) +#endif +#else // defined(hwy_EXPORTS) +/* We are using this library */ +#ifdef _WIN32 +#define HWY_DLLEXPORT __declspec(dllimport) +#else +#define HWY_DLLEXPORT __attribute__((visibility("default"))) +#endif +#endif // defined(hwy_EXPORTS) +#endif // HWY_DLLEXPORT + +#ifndef HWY_CONTRIB_DLLEXPORT +#if defined(hwy_contrib_EXPORTS) +/* We are building this library */ +#ifdef _WIN32 +#define HWY_CONTRIB_DLLEXPORT __declspec(dllexport) +#else +#define HWY_CONTRIB_DLLEXPORT __attribute__((visibility("default"))) +#endif +#else // defined(hwy_contrib_EXPORTS) +/* We are using this library */ +#ifdef _WIN32 +#define HWY_CONTRIB_DLLEXPORT __declspec(dllimport) +#else +#define HWY_CONTRIB_DLLEXPORT __attribute__((visibility("default"))) +#endif +#endif // defined(hwy_contrib_EXPORTS) +#endif // HWY_CONTRIB_DLLEXPORT + +#ifndef HWY_TEST_DLLEXPORT +#if defined(hwy_test_EXPORTS) +/* We are building this library */ +#ifdef _WIN32 +#define HWY_TEST_DLLEXPORT __declspec(dllexport) +#else +#define HWY_TEST_DLLEXPORT __attribute__((visibility("default"))) +#endif +#else // defined(hwy_test_EXPORTS) +/* We are using this library */ +#ifdef _WIN32 +#define HWY_TEST_DLLEXPORT __declspec(dllimport) +#else +#define HWY_TEST_DLLEXPORT __attribute__((visibility("default"))) +#endif +#endif // defined(hwy_test_EXPORTS) +#endif // HWY_TEST_DLLEXPORT + +#endif // !HWY_SHARED_DEFINE + +#endif /* HWY_DLLEXPORT_H */ diff --git a/hwy/highway_test.cc b/hwy/highway_test.cc new file mode 100644 index 0000000..4838e72 --- /dev/null +++ b/hwy/highway_test.cc @@ -0,0 +1,485 @@ +// Copyright 2019 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <bitset>
+
+#include "hwy/base.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "highway_test.cc"
+#include "hwy/foreach_target.h"  // IWYU pragma: keep
+#include "hwy/highway.h"
+#include "hwy/nanobenchmark.h"  // Unpredictable1
+#include "hwy/tests/test_util-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+template <size_t kLimit, typename T>
+HWY_NOINLINE void TestCappedLimit(T /* tag */) {
+  CappedTag<T, kLimit> d;
+  // Ensure two ops compile
+  HWY_ASSERT_VEC_EQ(d, Zero(d), Set(d, T{0}));
+
+  // Ensure we do not write more than kLimit lanes
+  const size_t N = Lanes(d);
+  if (kLimit < N) {
+    auto lanes = AllocateAligned<T>(N);
+    std::fill(lanes.get(), lanes.get() + N, T{0});
+    Store(Set(d, T{1}), d, lanes.get());
+    for (size_t i = kLimit; i < N; ++i) {
+      HWY_ASSERT_EQ(lanes[i], T{0});
+    }
+  }
+}
+
+// Adapter for ForAllTypes - we are constructing our own Simd<> and thus do
+// not use ForPartialVectors etc.
+struct TestCapped {
+  template <typename T>
+  void operator()(T t) const {
+    TestCappedLimit<1>(t);
+    TestCappedLimit<3>(t);
+    TestCappedLimit<5>(t);
+    TestCappedLimit<1ull << 15>(t);
+  }
+};
+
+HWY_NOINLINE void TestAllCapped() { ForAllTypes(TestCapped()); }
+
+// For testing that ForPartialVectors reaches every possible size:
+using NumLanesSet = std::bitset<HWY_MAX_BYTES + 1>;
+
+// Monostate pattern because ForPartialVectors takes a template argument, not
+// a functor by reference.
+static NumLanesSet* NumLanesForSize(size_t sizeof_t) {
+  HWY_ASSERT(sizeof_t <= sizeof(uint64_t));
+  static NumLanesSet num_lanes[sizeof(uint64_t) + 1];
+  return num_lanes + sizeof_t;
+}
+static size_t* MaxLanesForSize(size_t sizeof_t) {
+  HWY_ASSERT(sizeof_t <= sizeof(uint64_t));
+  static size_t num_lanes[sizeof(uint64_t) + 1] = {0};
+  return num_lanes + sizeof_t;
+}
+
+struct TestMaxLanes {
+  template <typename T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const size_t N = Lanes(d);
+    const size_t kMax = MaxLanes(d);  // for RVV, includes LMUL
+    HWY_ASSERT(N <= kMax);
+    HWY_ASSERT(kMax <= (HWY_MAX_BYTES / sizeof(T)));
+
+    NumLanesForSize(sizeof(T))->set(N);
+    *MaxLanesForSize(sizeof(T)) = HWY_MAX(*MaxLanesForSize(sizeof(T)), N);
+  }
+};
+
+HWY_NOINLINE void TestAllMaxLanes() {
+  ForAllTypes(ForPartialVectors<TestMaxLanes>());
+
+  // Ensure ForPartialVectors visited all powers of two [1, N].
+  for (size_t sizeof_t : {sizeof(uint8_t), sizeof(uint16_t), sizeof(uint32_t),
+                          sizeof(uint64_t)}) {
+    const size_t N = *MaxLanesForSize(sizeof_t);
+    for (size_t i = 1; i <= N; i += i) {
+      if (!NumLanesForSize(sizeof_t)->test(i)) {
+        fprintf(stderr, "T=%d: did not visit for N=%d, max=%d\n",
+                static_cast<int>(sizeof_t), static_cast<int>(i),
+                static_cast<int>(N));
+        HWY_ASSERT(false);
+      }
+    }
+  }
+}
+
+struct TestSet {
+  template <typename T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    // Zero
+    const auto v0 = Zero(d);
+    const size_t N = Lanes(d);
+    auto expected = AllocateAligned<T>(N);
+    std::fill(expected.get(), expected.get() + N, T(0));
+    HWY_ASSERT_VEC_EQ(d, expected.get(), v0);
+
+    // Set
+    const auto v2 = Set(d, T(2));
+    for (size_t i = 0; i < N; ++i) {
+      expected[i] = 2;
+    }
+    HWY_ASSERT_VEC_EQ(d, expected.get(), v2);
+
+    // Iota
+    const auto vi = Iota(d, T(5));
+    for (size_t i = 0; i < N; ++i) {
+      expected[i] = T(5 + i);
+    }
+    HWY_ASSERT_VEC_EQ(d, expected.get(), vi);
+
+    // Undefined
+    const auto vu = Undefined(d);
+    Store(vu, d, expected.get());
+  }
+};
+
+HWY_NOINLINE void TestAllSet() { ForAllTypes(ForPartialVectors<TestSet>()); }
+
+// Ensures wraparound (mod 2^bits)
+struct TestOverflow {
+  template <typename T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const auto v1 = Set(d, T(1));
+    const auto vmax = Set(d, LimitsMax<T>());
+    const auto vmin = Set(d, LimitsMin<T>());
+    // Unsigned underflow / negative -> positive
+    HWY_ASSERT_VEC_EQ(d, vmax, Sub(vmin, v1));
+    // Unsigned overflow / positive -> negative
+    HWY_ASSERT_VEC_EQ(d, vmin, Add(vmax, v1));
+  }
+};
+
+HWY_NOINLINE void TestAllOverflow() {
+  ForIntegerTypes(ForPartialVectors<TestOverflow>());
+}
+
+struct TestClamp {
+  template <typename T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const auto v0 = Zero(d);
+    const auto v1 = Set(d, 1);
+    const auto v2 = Set(d, 2);
+
+    HWY_ASSERT_VEC_EQ(d, v1, Clamp(v2, v0, v1));
+    HWY_ASSERT_VEC_EQ(d, v1, Clamp(v0, v1, v2));
+  }
+};
+
+HWY_NOINLINE void TestAllClamp() {
+  ForAllTypes(ForPartialVectors<TestClamp>());
+}
+
+struct TestSignBitInteger {
+  template <typename T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const auto v0 = Zero(d);
+    const auto all = VecFromMask(d, Eq(v0, v0));
+    const auto vs = SignBit(d);
+    const auto other = Sub(vs, Set(d, 1));
+
+    // Shifting left by one => overflow, equal zero
+    HWY_ASSERT_VEC_EQ(d, v0, Add(vs, vs));
+    // Verify the lower bits are zero (only +/- and logical ops are available
+    // for all types)
+    HWY_ASSERT_VEC_EQ(d, all, Add(vs, other));
+  }
+};
+
+struct TestSignBitFloat {
+  template <typename T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const auto v0 = Zero(d);
+    const auto vs = SignBit(d);
+    const auto vp = Set(d, 2.25);
+    const auto vn = Set(d, -2.25);
+    HWY_ASSERT_VEC_EQ(d, Or(vp, vs), vn);
+    HWY_ASSERT_VEC_EQ(d, AndNot(vs, vn), vp);
+    HWY_ASSERT_VEC_EQ(d, v0, vs);
+  }
+};
+
+HWY_NOINLINE void TestAllSignBit() {
+  ForIntegerTypes(ForPartialVectors<TestSignBitInteger>());
+  ForFloatTypes(ForPartialVectors<TestSignBitFloat>());
+}
+
+// inline to work around incorrect SVE codegen (only first 128 bits used).
+template <class D, class V>
+HWY_INLINE void AssertNaN(D d, VecArg<V> v, const char* file, int line) {
+  using T = TFromD<D>;
+  const size_t N = Lanes(d);
+  if (!AllTrue(d, IsNaN(v))) {
+    Print(d, "not all NaN", v, 0, N);
+    Print(d, "mask", VecFromMask(d, IsNaN(v)), 0, N);
+    const std::string type_name = TypeName(T(), N);
+    // RVV lacks PRIu64 and MSYS still has problems with %zu, so print bytes to
+    // avoid truncating doubles.
+    uint8_t bytes[HWY_MAX(sizeof(T), 8)] = {0};
+    const T lane = GetLane(v);
+    CopyBytes<sizeof(T)>(&lane, bytes);
+    Abort(file, line,
+          "Expected %s NaN, got %E (bytes %02x %02x %02x %02x %02x %02x %02x "
+          "%02x)",
+          type_name.c_str(), lane, bytes[0], bytes[1], bytes[2], bytes[3],
+          bytes[4], bytes[5], bytes[6], bytes[7]);
+  }
+}
+
+#define HWY_ASSERT_NAN(d, v) AssertNaN(d, v, __FILE__, __LINE__)
+
+struct TestNaN {
+  template <typename T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const auto v1 = Set(d, T(Unpredictable1()));
+    const auto nan = IfThenElse(Eq(v1, Set(d, T(1))), NaN(d), v1);
+    HWY_ASSERT_NAN(d, nan);
+
+    // Arithmetic
+    HWY_ASSERT_NAN(d, Add(nan, v1));
+    HWY_ASSERT_NAN(d, Add(v1, nan));
+    HWY_ASSERT_NAN(d, Sub(nan, v1));
+    HWY_ASSERT_NAN(d, Sub(v1, nan));
+    HWY_ASSERT_NAN(d, Mul(nan, v1));
+    HWY_ASSERT_NAN(d, Mul(v1, nan));
+    HWY_ASSERT_NAN(d, Div(nan, v1));
+    HWY_ASSERT_NAN(d, Div(v1, nan));
+
+    // FMA
+    HWY_ASSERT_NAN(d, MulAdd(nan, v1, v1));
+    HWY_ASSERT_NAN(d, MulAdd(v1, nan, v1));
+    HWY_ASSERT_NAN(d, MulAdd(v1, v1, nan));
+    HWY_ASSERT_NAN(d, MulSub(nan, v1, v1));
+    HWY_ASSERT_NAN(d, MulSub(v1, nan, v1));
+    HWY_ASSERT_NAN(d, MulSub(v1, v1, nan));
+    HWY_ASSERT_NAN(d, NegMulAdd(nan, v1, v1));
+    HWY_ASSERT_NAN(d, NegMulAdd(v1, nan, v1));
+    HWY_ASSERT_NAN(d, NegMulAdd(v1, v1, nan));
+    HWY_ASSERT_NAN(d, NegMulSub(nan, v1, v1));
+    HWY_ASSERT_NAN(d, NegMulSub(v1, nan, v1));
+    HWY_ASSERT_NAN(d, NegMulSub(v1, v1, nan));
+
+    // Rcp/Sqrt
+    HWY_ASSERT_NAN(d, Sqrt(nan));
+
+    // Sign manipulation
+    HWY_ASSERT_NAN(d, Abs(nan));
+    HWY_ASSERT_NAN(d, Neg(nan));
+    HWY_ASSERT_NAN(d, CopySign(nan, v1));
+    HWY_ASSERT_NAN(d, CopySignToAbs(nan, v1));
+
+    // Rounding
+    HWY_ASSERT_NAN(d, Ceil(nan));
+    HWY_ASSERT_NAN(d, Floor(nan));
+    HWY_ASSERT_NAN(d, Round(nan));
+    HWY_ASSERT_NAN(d, Trunc(nan));
+
+    // Logical (And/AndNot/Xor will clear NaN!)
+    HWY_ASSERT_NAN(d, Or(nan, v1));
+
+    // Comparison
+    HWY_ASSERT(AllFalse(d, Eq(nan, v1)));
+    HWY_ASSERT(AllFalse(d, Gt(nan, v1)));
+    HWY_ASSERT(AllFalse(d, Lt(nan, v1)));
+    HWY_ASSERT(AllFalse(d, Ge(nan, v1)));
+    HWY_ASSERT(AllFalse(d, Le(nan, v1)));
+
+    // Reduction
+    HWY_ASSERT_NAN(d, SumOfLanes(d, nan));
+// TODO(janwas): re-enable after QEMU/Spike are fixed
+#if HWY_TARGET != HWY_RVV
+    HWY_ASSERT_NAN(d, MinOfLanes(d, nan));
+    HWY_ASSERT_NAN(d, MaxOfLanes(d, nan));
+#endif
+
+    // Min
+#if HWY_ARCH_X86 && (HWY_TARGET != HWY_SCALAR && HWY_TARGET != HWY_EMU128)
+    // x86 SIMD returns the second operand if any input is NaN.
+    HWY_ASSERT_VEC_EQ(d, v1, Min(nan, v1));
+    HWY_ASSERT_VEC_EQ(d, v1, Max(nan, v1));
+    HWY_ASSERT_NAN(d, Min(v1, nan));
+    HWY_ASSERT_NAN(d, Max(v1, nan));
+#elif HWY_ARCH_WASM
+    // Should return NaN if any input is NaN, but does not for scalar.
+    // TODO(janwas): remove once this is fixed.
+#elif HWY_TARGET == HWY_NEON && HWY_ARCH_ARM_V7
+    // ARMv7 NEON returns NaN if any input is NaN.
+    HWY_ASSERT_NAN(d, Min(v1, nan));
+    HWY_ASSERT_NAN(d, Max(v1, nan));
+    HWY_ASSERT_NAN(d, Min(nan, v1));
+    HWY_ASSERT_NAN(d, Max(nan, v1));
+#else
+    // IEEE 754-2019 minimumNumber is defined as the other argument if exactly
+    // one is NaN, and qNaN if both are.
+    HWY_ASSERT_VEC_EQ(d, v1, Min(nan, v1));
+    HWY_ASSERT_VEC_EQ(d, v1, Max(nan, v1));
+    HWY_ASSERT_VEC_EQ(d, v1, Min(v1, nan));
+    HWY_ASSERT_VEC_EQ(d, v1, Max(v1, nan));
+#endif
+    HWY_ASSERT_NAN(d, Min(nan, nan));
+    HWY_ASSERT_NAN(d, Max(nan, nan));
+  }
+};
+
+// For functions only available for float32
+struct TestF32NaN {
+  template <typename T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const auto v1 = Set(d, T(Unpredictable1()));
+    const auto nan = IfThenElse(Eq(v1, Set(d, T(1))), NaN(d), v1);
+    HWY_ASSERT_NAN(d, ApproximateReciprocal(nan));
+    HWY_ASSERT_NAN(d, ApproximateReciprocalSqrt(nan));
+    HWY_ASSERT_NAN(d, AbsDiff(nan, v1));
+    HWY_ASSERT_NAN(d, AbsDiff(v1, nan));
+  }
+};
+
+HWY_NOINLINE void TestAllNaN() {
+  ForFloatTypes(ForPartialVectors<TestNaN>());
+  ForPartialVectors<TestF32NaN>()(float());
+}
+
+struct TestIsNaN {
+  template <typename T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const auto v1 = Set(d, T(Unpredictable1()));
+    const auto inf = IfThenElse(Eq(v1, Set(d, T(1))), Inf(d), v1);
+    const auto nan = IfThenElse(Eq(v1, Set(d, T(1))), NaN(d), v1);
+    const auto neg = Set(d, T{-1});
+    HWY_ASSERT_NAN(d, nan);
+    HWY_ASSERT_MASK_EQ(d, MaskFalse(d), IsNaN(inf));
+    HWY_ASSERT_MASK_EQ(d, MaskFalse(d), IsNaN(CopySign(inf, neg)));
+    HWY_ASSERT_MASK_EQ(d, MaskTrue(d), IsNaN(nan));
+    HWY_ASSERT_MASK_EQ(d, MaskTrue(d), IsNaN(CopySign(nan, neg)));
+    HWY_ASSERT_MASK_EQ(d, MaskFalse(d), IsNaN(v1));
+    HWY_ASSERT_MASK_EQ(d, MaskFalse(d), IsNaN(Zero(d)));
+    HWY_ASSERT_MASK_EQ(d, MaskFalse(d), IsNaN(Set(d, hwy::LowestValue<T>())));
+    HWY_ASSERT_MASK_EQ(d, MaskFalse(d), IsNaN(Set(d, hwy::HighestValue<T>())));
+  }
+};
+
+HWY_NOINLINE void TestAllIsNaN() {
+  ForFloatTypes(ForPartialVectors<TestIsNaN>());
+}
+
+struct TestIsInf {
+  template <typename T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const auto v1 = Set(d, T(Unpredictable1()));
+    const auto inf = IfThenElse(Eq(v1, Set(d, T(1))), Inf(d), v1);
+    const auto nan = IfThenElse(Eq(v1, Set(d, T(1))), NaN(d), v1);
+    const auto neg = Set(d, T{-1});
+    HWY_ASSERT_MASK_EQ(d, MaskTrue(d), IsInf(inf));
+    HWY_ASSERT_MASK_EQ(d, MaskTrue(d), IsInf(CopySign(inf, neg)));
+    HWY_ASSERT_MASK_EQ(d, MaskFalse(d), IsInf(nan));
+    HWY_ASSERT_MASK_EQ(d, MaskFalse(d), IsInf(CopySign(nan, neg)));
+    HWY_ASSERT_MASK_EQ(d, MaskFalse(d), IsInf(v1));
+    HWY_ASSERT_MASK_EQ(d, MaskFalse(d), IsInf(Zero(d)));
+    HWY_ASSERT_MASK_EQ(d, MaskFalse(d), IsInf(Set(d, hwy::LowestValue<T>())));
+    HWY_ASSERT_MASK_EQ(d, MaskFalse(d), IsInf(Set(d, hwy::HighestValue<T>())));
+  }
+};
+
+HWY_NOINLINE void TestAllIsInf() {
+  ForFloatTypes(ForPartialVectors<TestIsInf>());
+}
+
+struct TestIsFinite {
+  template <typename T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const auto v1 = Set(d, T(Unpredictable1()));
+    const auto inf = IfThenElse(Eq(v1, Set(d, T(1))), Inf(d), v1);
+    const auto nan = IfThenElse(Eq(v1, Set(d, T(1))), NaN(d), v1);
+    const auto neg = Set(d, T{-1});
+    HWY_ASSERT_MASK_EQ(d, MaskFalse(d), IsFinite(inf));
+    HWY_ASSERT_MASK_EQ(d, MaskFalse(d), IsFinite(CopySign(inf, neg)));
+    HWY_ASSERT_MASK_EQ(d, MaskFalse(d), IsFinite(nan));
+    HWY_ASSERT_MASK_EQ(d, MaskFalse(d), IsFinite(CopySign(nan, neg)));
+    HWY_ASSERT_MASK_EQ(d, MaskTrue(d), IsFinite(v1));
+    HWY_ASSERT_MASK_EQ(d, MaskTrue(d), IsFinite(Zero(d)));
+    HWY_ASSERT_MASK_EQ(d, MaskTrue(d), IsFinite(Set(d, hwy::LowestValue<T>())));
+    HWY_ASSERT_MASK_EQ(d, MaskTrue(d),
+                       IsFinite(Set(d, hwy::HighestValue<T>())));
+  }
+};
+
+HWY_NOINLINE void TestAllIsFinite() {
+  ForFloatTypes(ForPartialVectors<TestIsFinite>());
+}
+
+struct TestCopyAndAssign {
+  template <typename T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    // copy V
+    const auto v3 = Iota(d, 3);
+    auto v3b(v3);
+    HWY_ASSERT_VEC_EQ(d, v3, v3b);
+
+    // assign V
+    auto v3c = Undefined(d);
+    v3c = v3;
+    HWY_ASSERT_VEC_EQ(d, v3, v3c);
+  }
+};
+
+HWY_NOINLINE void TestAllCopyAndAssign() {
+  ForAllTypes(ForPartialVectors<TestCopyAndAssign>());
+}
+
+struct TestGetLane {
+  template <typename T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    HWY_ASSERT_EQ(T(0), GetLane(Zero(d)));
+    HWY_ASSERT_EQ(T(1), GetLane(Set(d, 1)));
+  }
+};
+
+HWY_NOINLINE void TestAllGetLane() {
+  ForAllTypes(ForPartialVectors<TestGetLane>());
+}
+
+struct TestDFromV {
+  template <typename T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const auto v0 = Zero(d);
+    using D0 = DFromV<decltype(v0)>;         // not necessarily same as D
+    const auto v0b = And(v0, Set(D0(), 1));  // but vectors can interoperate
+    HWY_ASSERT_VEC_EQ(d, v0, v0b);
+  }
+};
+
+HWY_NOINLINE void TestAllDFromV() {
+  ForAllTypes(ForPartialVectors<TestDFromV>());
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+
+namespace hwy {
+HWY_BEFORE_TEST(HighwayTest);
+HWY_EXPORT_AND_TEST_P(HighwayTest, TestAllCapped);
+HWY_EXPORT_AND_TEST_P(HighwayTest, TestAllMaxLanes);
+HWY_EXPORT_AND_TEST_P(HighwayTest, TestAllSet);
+HWY_EXPORT_AND_TEST_P(HighwayTest, TestAllOverflow);
+HWY_EXPORT_AND_TEST_P(HighwayTest, TestAllClamp);
+HWY_EXPORT_AND_TEST_P(HighwayTest, TestAllSignBit);
+HWY_EXPORT_AND_TEST_P(HighwayTest, TestAllNaN);
+HWY_EXPORT_AND_TEST_P(HighwayTest, TestAllIsNaN);
+HWY_EXPORT_AND_TEST_P(HighwayTest, TestAllIsInf);
+HWY_EXPORT_AND_TEST_P(HighwayTest, TestAllIsFinite);
+HWY_EXPORT_AND_TEST_P(HighwayTest, TestAllCopyAndAssign);
+HWY_EXPORT_AND_TEST_P(HighwayTest, TestAllGetLane);
+HWY_EXPORT_AND_TEST_P(HighwayTest, TestAllDFromV);
+}  // namespace hwy
+
+#endif
diff --git a/hwy/hwy.version b/hwy/hwy.version
new file mode 100644
index 0000000..9ff6be6
--- /dev/null
+++ b/hwy/hwy.version
@@ -0,0 +1,19 @@
+HWY_0 {
+  global:
+    extern "C++" {
+      *hwy::*;
+    };
+
+  local:
+    # Hide all the std namespace symbols. std namespace is explicitly marked
+    # as visibility(default) and header-only functions or methods (such as
+    # those from templates) should be exposed in shared libraries as weak
+    # symbols, but this is only needed when we expose those types in the
+    # shared library API in any way. We don't use C++ std types in the API
+    # and we also don't support exceptions in the library.
+    # See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=36022 for a discussion
+    # about this.
+    extern "C++" {
+      *std::*;
+    };
+};
diff --git a/hwy/nanobenchmark.cc b/hwy/nanobenchmark.cc
new file mode 100644
index 0000000..e03ed4c
--- /dev/null
+++ b/hwy/nanobenchmark.cc
@@ -0,0 +1,762 @@
+// Copyright 2019 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "hwy/nanobenchmark.h"
+
+#ifndef __STDC_FORMAT_MACROS
+#define __STDC_FORMAT_MACROS  // before inttypes.h
+#endif
+#include <inttypes.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>  // clock_gettime
+
+#include <algorithm>  // sort
+#include <array>
+#include <atomic>
+#include <chrono>  //NOLINT
+#include <limits>
+#include <numeric>  // iota
+#include <random>
+#include <string>
+#include <vector>
+
+#if defined(_WIN32) || defined(_WIN64)
+#ifndef NOMINMAX
+#define NOMINMAX
+#endif  // NOMINMAX
+#include <windows.h>
+#endif
+
+#if defined(__APPLE__)
+#include <mach/mach.h>
+#include <mach/mach_time.h>
+#endif
+
+#if defined(__HAIKU__)
+#include <OS.h>
+#endif
+
+#include "hwy/base.h"
+#if HWY_ARCH_PPC && defined(__GLIBC__)
+#include <sys/platform/ppc.h>  // NOLINT __ppc_get_timebase_freq
+#elif HWY_ARCH_X86
+
+#if HWY_COMPILER_MSVC
+#include <intrin.h>
+#else
+#include <cpuid.h>  // NOLINT
+#endif  // HWY_COMPILER_MSVC
+
+#endif  // HWY_ARCH_X86
+
+namespace hwy {
+namespace {
+namespace timer {
+
+// Ticks := platform-specific timer values (CPU cycles on x86). Must be
+// unsigned to guarantee wraparound on overflow.
+using Ticks = uint64_t;
+
+// Start/Stop return absolute timestamps and must be placed immediately before
+// and after the region to measure. We provide separate Start/Stop functions
+// because they use different fences.
+//
+// Background: RDTSC is not 'serializing'; earlier instructions may complete
+// after it, and/or later instructions may complete before it. 'Fences' ensure
+// regions' elapsed times are independent of such reordering. The only
+// documented unprivileged serializing instruction is CPUID, which acts as a
+// full fence (no reordering across it in either direction). Unfortunately
+// the latency of CPUID varies wildly (perhaps made worse by not initializing
+// its EAX input). Because it cannot reliably be deducted from the region's
+// elapsed time, it must not be included in the region to measure (i.e.
+// between the two RDTSC).
+//
+// The newer RDTSCP is sometimes described as serializing, but it actually
+// only serves as a half-fence with release semantics. Although all
+// instructions in the region will complete before the final timestamp is
+// captured, subsequent instructions may leak into the region and increase the
+// elapsed time. Inserting another fence after the final RDTSCP would prevent
+// such reordering without affecting the measured region.
+//
+// Fortunately, such a fence exists. The LFENCE instruction is only documented
+// to delay later loads until earlier loads are visible. However, Intel's
+// reference manual says it acts as a full fence (waiting until all earlier
+// instructions have completed, and delaying later instructions until it
+// completes). AMD assigns the same behavior to MFENCE.
+//
+// We need a fence before the initial RDTSC to prevent earlier instructions
+// from leaking into the region, and arguably another after RDTSC to avoid
+// region instructions from completing before the timestamp is recorded.
+// When surrounded by fences, the additional RDTSCP half-fence provides no
+// benefit, so the initial timestamp can be recorded via RDTSC, which has
+// lower overhead than RDTSCP because it does not read TSC_AUX. In summary,
+// we define Start = LFENCE/RDTSC/LFENCE; Stop = RDTSCP/LFENCE.
+//
+// Using Start+Start leads to higher variance and overhead than Stop+Stop.
+// However, Stop+Stop includes an LFENCE in the region measurements, which
+// adds a delay dependent on earlier loads. The combination of Start+Stop
+// is faster than Start+Start and more consistent than Stop+Stop because
+// the first LFENCE already delayed subsequent loads before the measured
+// region. This combination seems not to have been considered in prior work:
+// http://akaros.cs.berkeley.edu/lxr/akaros/kern/arch/x86/rdtsc_test.c
+//
+// Note: performance counters can measure 'exact' instructions-retired or
+// (unhalted) cycle counts. The RDPMC instruction is not serializing and also
+// requires fences. Unfortunately, it is not accessible on all OSes and we
+// prefer to avoid kernel-mode drivers. Performance counters are also affected
+// by several under/over-count errata, so we use the TSC instead.
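A usage sketch of the timer interface defined below (the helper name is illustrative; on x86, callers must first verify RDTSCP support via HasRDTSCP(), declared later in this file):

  // Measures one region in ticks; divide by InvariantTicksPerSecond() to
  // obtain seconds.
  uint64_t MeasureRegionTicks() {
    const timer::Ticks t0 = timer::Start();
    // ... region to measure ...
    const timer::Ticks t1 = timer::Stop();  // x86: requires RDTSCP support
    return t1 - t0;
  }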
+
+// Returns a 64-bit timestamp in unit of 'ticks'; to convert to seconds,
+// divide by InvariantTicksPerSecond.
+inline Ticks Start() {
+  Ticks t;
+#if HWY_ARCH_PPC && defined(__GLIBC__)
+  asm volatile("mfspr %0, %1" : "=r"(t) : "i"(268));
+#elif HWY_ARCH_ARM_A64 && !HWY_COMPILER_MSVC
+  // pmccntr_el0 is privileged but cntvct_el0 is accessible in Linux and QEMU.
+  asm volatile("mrs %0, cntvct_el0" : "=r"(t));
+#elif HWY_ARCH_X86 && HWY_COMPILER_MSVC
+  _ReadWriteBarrier();
+  _mm_lfence();
+  _ReadWriteBarrier();
+  t = __rdtsc();
+  _ReadWriteBarrier();
+  _mm_lfence();
+  _ReadWriteBarrier();
+#elif HWY_ARCH_X86_64
+  asm volatile(
+      "lfence\n\t"
+      "rdtsc\n\t"
+      "shl $32, %%rdx\n\t"
+      "or %%rdx, %0\n\t"
+      "lfence"
+      : "=a"(t)
+      :
+      // "memory" avoids reordering. rdx = TSC >> 32.
+      // "cc" = flags modified by SHL.
+      : "rdx", "memory", "cc");
+#elif HWY_ARCH_RVV
+  asm volatile("rdtime %0" : "=r"(t));
+#elif defined(_WIN32) || defined(_WIN64)
+  LARGE_INTEGER counter;
+  (void)QueryPerformanceCounter(&counter);
+  t = counter.QuadPart;
+#elif defined(__APPLE__)
+  t = mach_absolute_time();
+#elif defined(__HAIKU__)
+  t = system_time_nsecs();  // since boot
+#else  // POSIX
+  timespec ts;
+  clock_gettime(CLOCK_MONOTONIC, &ts);
+  t = static_cast<Ticks>(ts.tv_sec * 1000000000LL + ts.tv_nsec);
+#endif
+  return t;
+}
+
+// WARNING: on x86, caller must check HasRDTSCP before using this!
+inline Ticks Stop() {
+  uint64_t t;
+#if HWY_ARCH_PPC && defined(__GLIBC__)
+  asm volatile("mfspr %0, %1" : "=r"(t) : "i"(268));
+#elif HWY_ARCH_ARM_A64 && !HWY_COMPILER_MSVC
+  // pmccntr_el0 is privileged but cntvct_el0 is accessible in Linux and QEMU.
+  asm volatile("mrs %0, cntvct_el0" : "=r"(t));
+#elif HWY_ARCH_X86 && HWY_COMPILER_MSVC
+  _ReadWriteBarrier();
+  unsigned aux;
+  t = __rdtscp(&aux);
+  _ReadWriteBarrier();
+  _mm_lfence();
+  _ReadWriteBarrier();
+#elif HWY_ARCH_X86_64
+  // Use inline asm because __rdtscp generates code to store TSC_AUX (ecx).
+  asm volatile(
+      "rdtscp\n\t"
+      "shl $32, %%rdx\n\t"
+      "or %%rdx, %0\n\t"
+      "lfence"
+      : "=a"(t)
+      :
+      // "memory" avoids reordering. rcx = TSC_AUX. rdx = TSC >> 32.
+      // "cc" = flags modified by SHL.
+      : "rcx", "rdx", "memory", "cc");
+#else
+  t = Start();
+#endif
+  return t;
+}
+
+}  // namespace timer
+
+namespace robust_statistics {
+
+// Sorts integral values in ascending order (e.g. for Mode). About 3x faster
+// than std::sort for input distributions with very few unique values.
+template <typename T>
+void CountingSort(T* values, size_t num_values) {
+  // Unique values and their frequency (similar to flat_map).
+  using Unique = std::pair<T, uint32_t>;
+  std::vector<Unique> unique;
+  for (size_t i = 0; i < num_values; ++i) {
+    const T value = values[i];
+    const auto pos =
+        std::find_if(unique.begin(), unique.end(),
+                     [value](const Unique u) { return u.first == value; });
+    if (pos == unique.end()) {
+      unique.push_back(std::make_pair(value, 1));
+    } else {
+      ++pos->second;
+    }
+  }
+
+  // Sort in ascending order of value (pair.first).
+  std::sort(unique.begin(), unique.end());
+
+  // Write that many copies of each unique value to the array.
+  T* HWY_RESTRICT p = values;
+  for (const auto& value_count : unique) {
+    std::fill(p, p + value_count.second, value_count.first);
+    p += value_count.second;
+  }
+  NANOBENCHMARK_CHECK(p == values + num_values);
+}
+
+// @return i in [idx_begin, idx_begin + half_count) that minimizes
+// sorted[i + half_count] - sorted[i].
+template <typename T>
+size_t MinRange(const T* const HWY_RESTRICT sorted, const size_t idx_begin,
+                const size_t half_count) {
+  T min_range = std::numeric_limits<T>::max();
+  size_t min_idx = 0;
+
+  for (size_t idx = idx_begin; idx < idx_begin + half_count; ++idx) {
+    NANOBENCHMARK_CHECK(sorted[idx] <= sorted[idx + half_count]);
+    const T range = sorted[idx + half_count] - sorted[idx];
+    if (range < min_range) {
+      min_range = range;
+      min_idx = idx;
+    }
+  }
+
+  return min_idx;
+}
+
+// Returns an estimate of the mode by calling MinRange on successively
+// halved intervals. "sorted" must be in ascending order. This is the
+// Half Sample Mode estimator proposed by Bickel in "On a fast, robust
+// estimator of the mode", with complexity O(N log N). The mode is less
+// affected by outliers in highly-skewed distributions than the median.
+// The averaging operation below assumes "T" is an unsigned integer type.
+template <typename T>
+T ModeOfSorted(const T* const HWY_RESTRICT sorted, const size_t num_values) {
+  size_t idx_begin = 0;
+  size_t half_count = num_values / 2;
+  while (half_count > 1) {
+    idx_begin = MinRange(sorted, idx_begin, half_count);
+    half_count >>= 1;
+  }
+
+  const T x = sorted[idx_begin + 0];
+  if (half_count == 0) {
+    return x;
+  }
+  NANOBENCHMARK_CHECK(half_count == 1);
+  const T average = (x + sorted[idx_begin + 1] + 1) / 2;
+  return average;
+}
+
+// Returns the mode. Side effect: sorts "values".
+template <typename T>
+T Mode(T* values, const size_t num_values) {
+  CountingSort(values, num_values);
+  return ModeOfSorted(values, num_values);
+}
+
+template <typename T, size_t N>
+T Mode(T (&values)[N]) {
+  return Mode(&values[0], N);
+}
+
+// Returns the median value. Side effect: sorts "values".
+template <typename T>
+T Median(T* values, const size_t num_values) {
+  NANOBENCHMARK_CHECK(num_values != 0);
+  std::sort(values, values + num_values);
+  const size_t half = num_values / 2;
+  // Odd count: return middle
+  if (num_values % 2) {
+    return values[half];
+  }
+  // Even count: return average of middle two.
+  return (values[half] + values[half - 1] + 1) / 2;
+}
+
+// Returns a robust measure of variability.
+template <typename T>
+T MedianAbsoluteDeviation(const T* values, const size_t num_values,
+                          const T median) {
+  NANOBENCHMARK_CHECK(num_values != 0);
+  std::vector<T> abs_deviations;
+  abs_deviations.reserve(num_values);
+  for (size_t i = 0; i < num_values; ++i) {
+    const int64_t abs = std::abs(static_cast<int64_t>(values[i]) -
+                                 static_cast<int64_t>(median));
+    abs_deviations.push_back(static_cast<T>(abs));
+  }
+  return Median(abs_deviations.data(), num_values);
+}
+
+}  // namespace robust_statistics
+}  // namespace
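A small worked example of these estimators (input values chosen arbitrarily; they are callable only within this translation unit because the namespace is anonymous). For the samples {9, 3, 3, 4, 3}, the mean 4.4 is pulled toward the outlier 9, whereas both robust estimators return 3:

  uint64_t samples[5] = {9, 3, 3, 4, 3};
  const uint64_t mode = robust_statistics::Mode(samples, 5);      // == 3
  const uint64_t median = robust_statistics::Median(samples, 5);  // == 3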
+namespace platform {
+namespace {
+
+// Prevents the compiler from eliding the computations that led to "output".
+template <class T>
+inline void PreventElision(T&& output) {
+#if HWY_COMPILER_MSVC == 0
+  // Works by indicating to the compiler that "output" is being read and
+  // modified. The +r constraint avoids unnecessary writes to memory, but only
+  // works for built-in types (typically FuncOutput).
+  asm volatile("" : "+r"(output) : : "memory");
+#else
+  // MSVC does not support inline assembly anymore (and never supported GCC's
+  // RTL constraints). Self-assignment with #pragma optimize("off") might be
+  // expected to prevent elision, but it does not with MSVC 2015. Type-punning
+  // with volatile pointers generates inefficient code on MSVC 2017.
+  static std::atomic<T> dummy(T{});
+  dummy.store(output, std::memory_order_relaxed);
+#endif
+}
+
+// Measures the actual current frequency of Ticks. We cannot rely on the
+// nominal frequency encoded in the x86 BrandString because it is misleading
+// on M1 Rosetta, and not reported by AMD. CPUID 0x15 is also not yet widely
+// supported. Also used on RISC-V and ARM64.
+HWY_MAYBE_UNUSED double MeasureNominalClockRate() {
+  double max_ticks_per_sec = 0.0;
+  // Arbitrary, enough to ignore 2 outliers without excessive init time.
+  for (int rep = 0; rep < 3; ++rep) {
+    auto time0 = std::chrono::steady_clock::now();
+    using Time = decltype(time0);
+    const timer::Ticks ticks0 = timer::Start();
+    const Time time_min = time0 + std::chrono::milliseconds(10);
+
+    Time time1;
+    timer::Ticks ticks1;
+    for (;;) {
+      time1 = std::chrono::steady_clock::now();
+      // Ideally this would be Stop, but that requires RDTSCP on x86. To avoid
+      // another codepath, just use Start instead. now() presumably has its
+      // own fence-like behavior.
+      ticks1 = timer::Start();  // Do not use Stop, see comment above
+      if (time1 >= time_min) break;
+    }
+
+    const double dticks = static_cast<double>(ticks1 - ticks0);
+    std::chrono::duration<double, std::ratio<1>> dtime = time1 - time0;
+    const double ticks_per_sec = dticks / dtime.count();
+    max_ticks_per_sec = std::max(max_ticks_per_sec, ticks_per_sec);
+  }
+  return max_ticks_per_sec;
+}
+
+#if HWY_ARCH_X86
+
+void Cpuid(const uint32_t level, const uint32_t count,
+           uint32_t* HWY_RESTRICT abcd) {
+#if HWY_COMPILER_MSVC
+  int regs[4];
+  __cpuidex(regs, level, count);
+  for (int i = 0; i < 4; ++i) {
+    abcd[i] = regs[i];
+  }
+#else
+  uint32_t a;
+  uint32_t b;
+  uint32_t c;
+  uint32_t d;
+  __cpuid_count(level, count, a, b, c, d);
+  abcd[0] = a;
+  abcd[1] = b;
+  abcd[2] = c;
+  abcd[3] = d;
+#endif
+}
+
+bool HasRDTSCP() {
+  uint32_t abcd[4];
+  Cpuid(0x80000001U, 0, abcd);         // Extended feature flags
+  return (abcd[3] & (1u << 27)) != 0;  // RDTSCP
+}
+
+std::string BrandString() {
+  char brand_string[49];
+  std::array<uint32_t, 4> abcd;
+
+  // Check if brand string is supported (it is on all reasonable Intel/AMD)
+  Cpuid(0x80000000U, 0, abcd.data());
+  if (abcd[0] < 0x80000004U) {
+    return std::string();
+  }
+
+  for (size_t i = 0; i < 3; ++i) {
+    Cpuid(static_cast<uint32_t>(0x80000002U + i), 0, abcd.data());
+    CopyBytes<sizeof(abcd)>(&abcd[0], brand_string + i * 16);  // not same size
+  }
+  brand_string[48] = 0;
+  return brand_string;
+}
+
+#endif  // HWY_ARCH_X86
+
+}  // namespace
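Given the warning on Stop(), a sketch of the guard an x86 caller would use (the wrapper name is illustrative and assumes placement inside namespace platform; TimerResolution() below applies the same pattern):

  inline timer::Ticks SafeStop() {
  #if HWY_ARCH_X86
    // Stop() executes RDTSCP, which not all x86 CPUs support.
    if (!HasRDTSCP()) return timer::Start();
  #endif
    return timer::Stop();
  }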
+  static const double freq = MeasureNominalClockRate();
+  return freq;
+#elif defined(_WIN32) || defined(_WIN64)
+  LARGE_INTEGER freq;
+  (void)QueryPerformanceFrequency(&freq);
+  return static_cast<double>(freq.QuadPart);
+#elif defined(__APPLE__)
+  // https://developer.apple.com/library/mac/qa/qa1398/_index.html
+  mach_timebase_info_data_t timebase;
+  (void)mach_timebase_info(&timebase);
+  return static_cast<double>(timebase.denom) / timebase.numer * 1E9;
+#else
+  return 1E9;  // Haiku and clock_gettime return nanoseconds.
+#endif
+}
+
+HWY_DLLEXPORT double Now() {
+  static const double mul = 1.0 / InvariantTicksPerSecond();
+  return static_cast<double>(timer::Start()) * mul;
+}
+
+HWY_DLLEXPORT uint64_t TimerResolution() {
+#if HWY_ARCH_X86
+  bool can_use_stop = platform::HasRDTSCP();
+#else
+  constexpr bool can_use_stop = true;
+#endif
+
+  // Nested loop avoids exceeding stack/L1 capacity.
+  timer::Ticks repetitions[Params::kTimerSamples];
+  for (size_t rep = 0; rep < Params::kTimerSamples; ++rep) {
+    timer::Ticks samples[Params::kTimerSamples];
+    if (can_use_stop) {
+      for (size_t i = 0; i < Params::kTimerSamples; ++i) {
+        const timer::Ticks t0 = timer::Start();
+        const timer::Ticks t1 = timer::Stop();  // we checked HasRDTSCP above
+        samples[i] = t1 - t0;
+      }
+    } else {
+      for (size_t i = 0; i < Params::kTimerSamples; ++i) {
+        const timer::Ticks t0 = timer::Start();
+        const timer::Ticks t1 = timer::Start();  // do not use Stop, see above
+        samples[i] = t1 - t0;
+      }
+    }
+    repetitions[rep] = robust_statistics::Mode(samples);
+  }
+  return robust_statistics::Mode(repetitions);
+}
+
+}  // namespace platform
+namespace {
+
+static const timer::Ticks timer_resolution = platform::TimerResolution();
+
+// Estimates the expected value of "lambda" values with a variable number of
+// samples until the variability "rel_mad" is less than "max_rel_mad".
+template <class Lambda>
+timer::Ticks SampleUntilStable(const double max_rel_mad, double* rel_mad,
+                               const Params& p, const Lambda& lambda) {
+  // Choose an initial samples_per_eval based on a single estimated duration.
+  timer::Ticks t0 = timer::Start();
+  lambda();
+  timer::Ticks t1 = timer::Stop();  // Caller checks HasRDTSCP.
+  timer::Ticks est = t1 - t0;
+  static const double ticks_per_second = platform::InvariantTicksPerSecond();
+  const size_t ticks_per_eval =
+      static_cast<size_t>(ticks_per_second * p.seconds_per_eval);
+  size_t samples_per_eval = est == 0
+                                ? p.min_samples_per_eval
+                                : static_cast<size_t>(ticks_per_eval / est);
+  samples_per_eval = HWY_MAX(samples_per_eval, p.min_samples_per_eval);
+
+  std::vector<timer::Ticks> samples;
+  samples.reserve(1 + samples_per_eval);
+  samples.push_back(est);
+
+  // A percentage is too strict for tiny differences, so also allow a small
+  // absolute "median absolute deviation".
+  const timer::Ticks max_abs_mad = (timer_resolution + 99) / 100;
+  *rel_mad = 0.0;  // ensure initialized
+
+  for (size_t eval = 0; eval < p.max_evals; ++eval, samples_per_eval *= 2) {
+    samples.reserve(samples.size() + samples_per_eval);
+    for (size_t i = 0; i < samples_per_eval; ++i) {
+      t0 = timer::Start();
+      lambda();
+      t1 = timer::Stop();  // Caller checks HasRDTSCP.
+      samples.push_back(t1 - t0);
+    }
+
+    if (samples.size() >= p.min_mode_samples) {
+      est = robust_statistics::Mode(samples.data(), samples.size());
+    } else {
+      // For "few" samples (also depending on the variance), Median is safer.
+      est = robust_statistics::Median(samples.data(), samples.size());
+    }
+    NANOBENCHMARK_CHECK(est != 0);
+
+    // Median absolute deviation (MAD) is a robust measure of variability.
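+    // (Worked example: samples {98, 100, 100, 103} with center (median)
+    // est = 100 have absolute deviations {2, 0, 0, 3}, whose median, the
+    // MAD, is 1 after rounding, so rel_mad = 1/100 = 1%.)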
+    const timer::Ticks abs_mad = robust_statistics::MedianAbsoluteDeviation(
+        samples.data(), samples.size(), est);
+    *rel_mad = static_cast<double>(abs_mad) / static_cast<double>(est);
+
+    if (*rel_mad <= max_rel_mad || abs_mad <= max_abs_mad) {
+      if (p.verbose) {
+        printf("%6" PRIu64 " samples => %5" PRIu64 " (abs_mad=%4" PRIu64
+               ", rel_mad=%4.2f%%)\n",
+               static_cast<uint64_t>(samples.size()),
+               static_cast<uint64_t>(est), static_cast<uint64_t>(abs_mad),
+               *rel_mad * 100.0);
+      }
+      return est;
+    }
+  }
+
+  if (p.verbose) {
+    printf("WARNING: rel_mad=%4.2f%% still exceeds %4.2f%% after %6" PRIu64
+           " samples.\n",
+           *rel_mad * 100.0, max_rel_mad * 100.0,
+           static_cast<uint64_t>(samples.size()));
+  }
+  return est;
+}
+
+using InputVec = std::vector<FuncInput>;
+
+// Returns a vector of unique input values.
+InputVec UniqueInputs(const FuncInput* inputs, const size_t num_inputs) {
+  InputVec unique(inputs, inputs + num_inputs);
+  std::sort(unique.begin(), unique.end());
+  unique.erase(std::unique(unique.begin(), unique.end()), unique.end());
+  return unique;
+}
+
+// Returns how often we need to call func for sufficient precision.
+size_t NumSkip(const Func func, const uint8_t* arg, const InputVec& unique,
+               const Params& p) {
+  // Minimum elapsed ticks over any input.
+  timer::Ticks min_duration = ~timer::Ticks(0);
+
+  for (const FuncInput input : unique) {
+    double rel_mad;
+    const timer::Ticks total = SampleUntilStable(
+        p.target_rel_mad, &rel_mad, p,
+        [func, arg, input]() { platform::PreventElision(func(arg, input)); });
+    min_duration = HWY_MIN(min_duration, total - timer_resolution);
+  }
+
+  // Number of repetitions required to reach the target resolution.
+  const size_t max_skip = p.precision_divisor;
+  // Number of repetitions given the estimated duration.
+  const size_t num_skip =
+      min_duration == 0
+          ? 0
+          : static_cast<size_t>((max_skip + min_duration - 1) / min_duration);
+  if (p.verbose) {
+    printf("res=%" PRIu64 " max_skip=%" PRIu64 " min_dur=%" PRIu64
+           " num_skip=%" PRIu64 "\n",
+           static_cast<uint64_t>(timer_resolution),
+           static_cast<uint64_t>(max_skip),
+           static_cast<uint64_t>(min_duration),
+           static_cast<uint64_t>(num_skip));
+  }
+  return num_skip;
+}
+
+// Replicates inputs until we can omit "num_skip" occurrences of an input.
+InputVec ReplicateInputs(const FuncInput* inputs, const size_t num_inputs,
+                         const size_t num_unique, const size_t num_skip,
+                         const Params& p) {
+  InputVec full;
+  if (num_unique == 1) {
+    full.assign(p.subset_ratio * num_skip, inputs[0]);
+    return full;
+  }
+
+  full.reserve(p.subset_ratio * num_skip * num_inputs);
+  for (size_t i = 0; i < p.subset_ratio * num_skip; ++i) {
+    full.insert(full.end(), inputs, inputs + num_inputs);
+  }
+  std::mt19937 rng;
+  std::shuffle(full.begin(), full.end(), rng);
+  return full;
+}
+
+// Copies "full" to "subset" in the same order, but with "num_skip"
+// randomly selected occurrences of "input_to_skip" removed.
+void FillSubset(const InputVec& full, const FuncInput input_to_skip,
+                const size_t num_skip, InputVec* subset) {
+  const size_t count = static_cast<size_t>(
+      std::count(full.begin(), full.end(), input_to_skip));
+  // Generate num_skip random indices: which occurrence to skip.
+  std::vector<uint32_t> omit(count);
+  std::iota(omit.begin(), omit.end(), 0);
+  // omit[] is the same on every call, but that's OK because they identify the
+  // Nth instance of input_to_skip, so the position within full[] differs.
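+  // (Sketch: with full = {3, 0, 2, 1, 2, 3}, input_to_skip = 2 and
+  // num_skip = 1, one of the two occurrences of 2 is chosen via omit[] and
+  // dropped, e.g. yielding subset = {3, 0, 2, 1, 3}.)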
+  std::mt19937 rng;
+  std::shuffle(omit.begin(), omit.end(), rng);
+  omit.resize(num_skip);
+  std::sort(omit.begin(), omit.end());
+
+  uint32_t occurrence = ~0u;  // 0 after preincrement
+  size_t idx_omit = 0;        // cursor within omit[]
+  size_t idx_subset = 0;      // cursor within *subset
+  for (const FuncInput next : full) {
+    if (next == input_to_skip) {
+      ++occurrence;
+      // Haven't removed enough already.
+      if (idx_omit < num_skip) {
+        // This one is up for removal.
+        if (occurrence == omit[idx_omit]) {
+          ++idx_omit;
+          continue;
+        }
+      }
+    }
+    if (idx_subset < subset->size()) {
+      (*subset)[idx_subset++] = next;
+    }
+  }
+  NANOBENCHMARK_CHECK(idx_subset == subset->size());
+  NANOBENCHMARK_CHECK(idx_omit == omit.size());
+  NANOBENCHMARK_CHECK(occurrence == count - 1);
+}
+
+// Returns total ticks elapsed for all inputs.
+timer::Ticks TotalDuration(const Func func, const uint8_t* arg,
+                           const InputVec* inputs, const Params& p,
+                           double* max_rel_mad) {
+  double rel_mad;
+  const timer::Ticks duration =
+      SampleUntilStable(p.target_rel_mad, &rel_mad, p, [func, arg, inputs]() {
+        for (const FuncInput input : *inputs) {
+          platform::PreventElision(func(arg, input));
+        }
+      });
+  *max_rel_mad = HWY_MAX(*max_rel_mad, rel_mad);
+  return duration;
+}
+
+// (Nearly) empty Func for measuring timer overhead/resolution.
+HWY_NOINLINE FuncOutput EmptyFunc(const void* /*arg*/, const FuncInput input) {
+  return input;
+}
+
+// Returns the overhead of accessing inputs[] and calling a function; this
+// will be deducted from future TotalDuration return values.
+timer::Ticks Overhead(const uint8_t* arg, const InputVec* inputs,
+                      const Params& p) {
+  double rel_mad;
+  // Zero tolerance because repeatability is crucial and EmptyFunc is fast.
+  return SampleUntilStable(0.0, &rel_mad, p, [arg, inputs]() {
+    for (const FuncInput input : *inputs) {
+      platform::PreventElision(EmptyFunc(arg, input));
+    }
+  });
+}
+
+}  // namespace
+
+HWY_DLLEXPORT int Unpredictable1() { return timer::Start() != ~0ULL; }
+
+HWY_DLLEXPORT size_t Measure(const Func func, const uint8_t* arg,
+                             const FuncInput* inputs, const size_t num_inputs,
+                             Result* results, const Params& p) {
+  NANOBENCHMARK_CHECK(num_inputs != 0);
+
+#if HWY_ARCH_X86
+  if (!platform::HasRDTSCP()) {
+    fprintf(stderr, "CPU '%s' does not support RDTSCP, skipping benchmark.\n",
+            platform::BrandString().c_str());
+    return 0;
+  }
+#endif
+
+  const InputVec& unique = UniqueInputs(inputs, num_inputs);
+
+  const size_t num_skip = NumSkip(func, arg, unique, p);
+  if (num_skip == 0) return 0;  // NumSkip already printed an error message.
+  // (Slightly less work on x86 to cast from a signed integer.)
+  const float mul = 1.0f / static_cast<float>(static_cast<int>(num_skip));
+
+  const InputVec& full =
+      ReplicateInputs(inputs, num_inputs, unique.size(), num_skip, p);
+  InputVec subset(full.size() - num_skip);
+
+  const timer::Ticks overhead = Overhead(arg, &full, p);
+  const timer::Ticks overhead_skip = Overhead(arg, &subset, p);
+  if (overhead < overhead_skip) {
+    fprintf(stderr, "Measurement failed: overhead %" PRIu64 " < %" PRIu64 "\n",
+            static_cast<uint64_t>(overhead),
+            static_cast<uint64_t>(overhead_skip));
+    return 0;
+  }
+
+  if (p.verbose) {
+    printf("#inputs=%5" PRIu64 ",%5" PRIu64 " overhead=%5" PRIu64 ",%5" PRIu64
+           "\n",
+           static_cast<uint64_t>(full.size()),
+           static_cast<uint64_t>(subset.size()),
+           static_cast<uint64_t>(overhead),
+           static_cast<uint64_t>(overhead_skip));
+  }
+
+  double max_rel_mad = 0.0;
+  const timer::Ticks total = TotalDuration(func, arg, &full, p, &max_rel_mad);
+
+  for (size_t i = 0; i < unique.size(); ++i) {
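+    // Leave-one-out: measure the total duration with num_skip occurrences of
+    // unique[i] removed; the difference to "total", after subtracting the
+    // respective overheads, isolates the ticks attributable to that input.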
+    FillSubset(full, unique[i], num_skip, &subset);
+    const timer::Ticks total_skip =
+        TotalDuration(func, arg, &subset, p, &max_rel_mad);
+
+    if (total < total_skip) {
+      fprintf(stderr, "Measurement failed: total %" PRIu64 " < %" PRIu64 "\n",
+              static_cast<uint64_t>(total),
+              static_cast<uint64_t>(total_skip));
+      return 0;
+    }
+
+    const timer::Ticks duration =
+        (total - overhead) - (total_skip - overhead_skip);
+    results[i].input = unique[i];
+    results[i].ticks = static_cast<float>(duration) * mul;
+    results[i].variability = static_cast<float>(max_rel_mad);
+  }
+
+  return unique.size();
+}
+
+}  // namespace hwy
diff --git a/hwy/nanobenchmark.h b/hwy/nanobenchmark.h
new file mode 100644
index 0000000..f0910b4
--- /dev/null
+++ b/hwy/nanobenchmark.h
@@ -0,0 +1,194 @@
+// Copyright 2019 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef HIGHWAY_HWY_NANOBENCHMARK_H_
+#define HIGHWAY_HWY_NANOBENCHMARK_H_
+
+// Benchmarks functions of a single integer argument with realistic branch
+// prediction hit rates. Uses a robust estimator to summarize the
+// measurements. The precision is about 0.2%.
+//
+// Examples: see nanobenchmark_test.cc.
+//
+// Background: Microbenchmarks such as http://github.com/google/benchmark
+// can measure elapsed times on the order of a microsecond. Shorter functions
+// are typically measured by repeating them thousands of times and dividing
+// the total elapsed time by this count. Unfortunately, repetition (especially
+// with the same input parameter!) influences the runtime. In time-critical
+// code, it is reasonable to expect warm instruction/data caches and TLBs,
+// but a perfect record of which branches will be taken is unrealistic.
+// Unless the application also repeatedly invokes the measured function with
+// the same parameter, the benchmark is measuring something very different:
+// a best-case result, almost as if the parameter were made a compile-time
+// constant. This may lead to erroneous conclusions about branch-heavy
+// algorithms outperforming branch-free alternatives.
+//
+// Our approach differs in three ways. Adding fences to the timer functions
+// reduces variability due to instruction reordering, improving the timer
+// resolution to about 40 CPU cycles. However, shorter functions must still
+// be invoked repeatedly. For more realistic branch prediction performance,
+// we vary the input parameter according to a user-specified distribution.
+// Thus, instead of VaryInputs(Measure(Repeat(func))), we change the
+// loop nesting to Measure(Repeat(VaryInputs(func))). We also estimate the
+// central tendency of the measurement samples with the "half sample mode",
+// which is more robust to outliers and skewed data than the mean or median.
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "hwy/highway_export.h"
+
+// Enables sanity checks that verify correct operation at the cost of
+// longer benchmark runs.
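+// (For example, compiling with -DNANOBENCHMARK_ENABLE_CHECKS=1 turns every
+// NANOBENCHMARK_CHECK below into an always-on assertion that aborts on
+// failure.)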
+#ifndef NANOBENCHMARK_ENABLE_CHECKS
+#define NANOBENCHMARK_ENABLE_CHECKS 0
+#endif
+
+#define NANOBENCHMARK_CHECK_ALWAYS(condition)                              \
+  while (!(condition)) {                                                   \
+    fprintf(stderr, "Nanobenchmark check failed at line %d\n", __LINE__); \
+    abort();                                                               \
+  }
+
+#if NANOBENCHMARK_ENABLE_CHECKS
+#define NANOBENCHMARK_CHECK(condition) NANOBENCHMARK_CHECK_ALWAYS(condition)
+#else
+#define NANOBENCHMARK_CHECK(condition)
+#endif
+
+namespace hwy {
+
+namespace platform {
+
+// Returns the tick rate, useful for converting measurements to seconds.
+// Invariant means the tick counter frequency is independent of CPU throttling
+// or sleep. This call may be expensive; callers should cache the result.
+HWY_DLLEXPORT double InvariantTicksPerSecond();
+
+// Returns the current timestamp [in seconds] relative to an unspecified
+// origin. Features: monotonic (no negative elapsed time), steady (unaffected
+// by system time changes), high-resolution (on the order of microseconds).
+HWY_DLLEXPORT double Now();
+
+// Returns the ticks elapsed in back-to-back timer calls, i.e. a function of
+// the timer resolution (minimum measurable difference) and overhead.
+// This call is expensive; callers should cache the result.
+HWY_DLLEXPORT uint64_t TimerResolution();
+
+}  // namespace platform
+
+// Returns 1, but without the compiler knowing what the value is. This
+// prevents optimizing out code.
+HWY_DLLEXPORT int Unpredictable1();
+
+// Input influencing the function being measured (e.g. number of bytes to
+// copy).
+using FuncInput = size_t;
+
+// "Proof of work" returned by Func to ensure the compiler does not elide it.
+using FuncOutput = uint64_t;
+
+// Function to measure: either 1) a captureless lambda or function with two
+// arguments or 2) a lambda with capture, in which case the first argument
+// is reserved for use by MeasureClosure.
+using Func = FuncOutput (*)(const void*, FuncInput);
+
+// Internal parameters that determine precision/resolution/measuring time.
+struct Params {
+  // For measuring timer overhead/resolution. Used in a nested loop =>
+  // quadratic time, acceptable because we know timer overhead is "low".
+  // constexpr because this is used to define array bounds.
+  static constexpr size_t kTimerSamples = 256;
+
+  // Best-case precision, expressed as a divisor of the timer resolution.
+  // Larger => more calls to Func and higher precision.
+  size_t precision_divisor = 1024;
+
+  // Ratio between full and subset input distribution sizes. Cannot be less
+  // than 2; larger values increase measurement time but more faithfully
+  // model the given input distribution.
+  size_t subset_ratio = 2;
+
+  // Together with the estimated Func duration, determines how many times to
+  // call Func before checking the sample variability. Larger values increase
+  // measurement time, memory/cache use and precision.
+  double seconds_per_eval = 4E-3;
+
+  // The minimum number of samples before estimating the central tendency.
+  size_t min_samples_per_eval = 7;
+
+  // The mode is better than the median for estimating the central tendency
+  // of skewed/fat-tailed distributions, but it requires sufficient samples
+  // relative to the width of half-ranges.
+  size_t min_mode_samples = 64;
+
+  // Maximum permissible variability (= median absolute deviation / center).
+  double target_rel_mad = 0.002;
+
+  // Abort after this many evals without reaching target_rel_mad. This
+  // prevents infinite loops.
+  size_t max_evals = 9;
+
+  // Whether to print additional statistics to stdout.
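+  // (When enabled, each evaluation prints its sample count, estimate and
+  // MAD, as done in SampleUntilStable.)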
+  bool verbose = true;
+};
+
+// Measurement result for each unique input.
+struct Result {
+  FuncInput input;
+
+  // Robust estimate (mode or median) of duration.
+  float ticks;
+
+  // Measure of variability (median absolute deviation relative to "ticks").
+  float variability;
+};
+
+// Precisely measures the number of ticks elapsed when calling "func" with the
+// given inputs, shuffled to ensure realistic branch prediction hit rates.
+//
+// "func" returns a 'proof of work' to ensure its computations are not elided.
+// "arg" is passed to Func, or reserved for internal use by MeasureClosure.
+// "inputs" is an array of "num_inputs" (not necessarily unique) arguments to
+// "func". The values should be chosen to maximize coverage of "func". This
+// represents a distribution, so a value's frequency should reflect its
+// probability in the real application. Order does not matter; for example, a
+// uniform distribution over [0, 4) could be represented as {3,0,2,1}.
+// Returns how many Result were written to "results": one per unique input, or
+// zero if the measurement failed (an error message goes to stderr).
+HWY_DLLEXPORT size_t Measure(const Func func, const uint8_t* arg,
+                             const FuncInput* inputs, const size_t num_inputs,
+                             Result* results, const Params& p = Params());
+
+// Calls operator() of the given closure (lambda function).
+template <class Closure>
+static FuncOutput CallClosure(const Closure* f, const FuncInput input) {
+  return (*f)(input);
+}
+
+// Same as Measure, except "closure" is typically a lambda function of
+// FuncInput -> FuncOutput with a capture list.
+template <class Closure>
+static inline size_t MeasureClosure(const Closure& closure,
+                                    const FuncInput* inputs,
+                                    const size_t num_inputs, Result* results,
+                                    const Params& p = Params()) {
+  return Measure(reinterpret_cast<Func>(&CallClosure<Closure>),
+                 reinterpret_cast<const uint8_t*>(&closure), inputs,
+                 num_inputs, results, p);
+}
+
+}  // namespace hwy
+
+#endif  // HIGHWAY_HWY_NANOBENCHMARK_H_
diff --git a/hwy/nanobenchmark_test.cc b/hwy/nanobenchmark_test.cc
new file mode 100644
index 0000000..0d153a1
--- /dev/null
+++ b/hwy/nanobenchmark_test.cc
@@ -0,0 +1,94 @@
+// Copyright 2019 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "hwy/nanobenchmark.h"
+
+#ifndef __STDC_FORMAT_MACROS
+#define __STDC_FORMAT_MACROS  // before inttypes.h
+#endif
+#include <inttypes.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#include <random>
+
+#include "hwy/tests/test_util-inl.h"
+
+namespace hwy {
+namespace {
+
+// Governs the duration of the test; avoid timeouts in debug builds.
+#if HWY_IS_DEBUG_BUILD
+constexpr size_t kMaxEvals = 3;
+#else
+constexpr size_t kMaxEvals = 4;
+#endif
+
+FuncOutput Div(const void*, FuncInput in) {
+  // Here we measure the throughput because benchmark invocations are
+  // independent. Any dividend will do; the divisor is nonzero.
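+  // (A loop-carried dependency, e.g. dividing the previous result instead,
+  // would measure latency rather than throughput.)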
+  return 0xFFFFF / in;
+}
+
+template <size_t N>
+void MeasureDiv(const FuncInput (&inputs)[N]) {
+  printf("Measuring integer division (output on final two lines)\n");
+  Result results[N];
+  Params params;
+  params.max_evals = kMaxEvals;
+  const size_t num_results =
+      Measure(&Div, nullptr, inputs, N, results, params);
+  for (size_t i = 0; i < num_results; ++i) {
+    printf("%5" PRIu64 ": %6.2f ticks; MAD=%4.2f%%\n",
+           static_cast<uint64_t>(results[i].input), results[i].ticks,
+           results[i].variability * 100.0);
+  }
+}
+
+std::mt19937 rng;
+
+// A function whose runtime depends on rng.
+FuncOutput Random(const void* /*arg*/, FuncInput in) {
+  const size_t r = rng() & 0xF;
+  FuncOutput ret = static_cast<FuncOutput>(in);
+  for (size_t i = 0; i < r; ++i) {
+    ret /= ((rng() & 1) + 2);
+  }
+  return ret;
+}
+
+// Ensure the measured variability is high.
+template <size_t N>
+void MeasureRandom(const FuncInput (&inputs)[N]) {
+  Result results[N];
+  Params p;
+  p.max_evals = kMaxEvals;
+  p.verbose = false;
+  const size_t num_results = Measure(&Random, nullptr, inputs, N, results, p);
+  for (size_t i = 0; i < num_results; ++i) {
+    NANOBENCHMARK_CHECK(results[i].variability > 1E-3);
+  }
+}
+
+TEST(NanobenchmarkTest, RunAll) {
+  const int unpredictable = Unpredictable1();  // == 1, unknown to compiler
+  static const FuncInput inputs[] = {
+      static_cast<FuncInput>(unpredictable) + 2,
+      static_cast<FuncInput>(unpredictable + 9)};
+
+  MeasureDiv(inputs);
+  MeasureRandom(inputs);
+}
+
+}  // namespace
+}  // namespace hwy
diff --git a/hwy/ops/arm_neon-inl.h b/hwy/ops/arm_neon-inl.h
new file mode 100644
index 0000000..f85fcf8
--- /dev/null
+++ b/hwy/ops/arm_neon-inl.h
@@ -0,0 +1,6664 @@
+// Copyright 2019 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// 128-bit ARM64 NEON vectors and operations.
+// External include guard in highway.h - see comment there.
+
+// ARM NEON intrinsics are documented at:
+// https://developer.arm.com/architectures/instruction-sets/intrinsics/#f:@navigationhierarchiessimdisa=[Neon]
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "hwy/ops/shared-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+
+// Must come after HWY_BEFORE_NAMESPACE so that the intrinsics are compiled
+// with the same target attribute as our code, see #834.
+HWY_DIAGNOSTICS(push)
+HWY_DIAGNOSTICS_OFF(disable : 4701, ignored "-Wuninitialized")
+#include <arm_neon.h>
+HWY_DIAGNOSTICS(pop)
+
+// Must come after arm_neon.h.
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+namespace detail {  // for code folding and Raw128
+
+// Macros used to define single and double function calls for multiple types
+// for full and half vectors. These macros are undefined at the end of the
+// file.
+
+// HWY_NEON_BUILD_TPL_* is the template<...> prefix to the function.
+#define HWY_NEON_BUILD_TPL_1
+#define HWY_NEON_BUILD_TPL_2
+#define HWY_NEON_BUILD_TPL_3
+
+// HWY_NEON_BUILD_RET_* is the return type; the type argument is without the
+// _t suffix so we can extend it to int32x4x2_t packs.
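+// (e.g. HWY_NEON_BUILD_RET_2(uint8, 16) expands to Vec128<uint8_t, 16>.)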
+#define HWY_NEON_BUILD_RET_1(type, size) Vec128<type##_t, size>
+#define HWY_NEON_BUILD_RET_2(type, size) Vec128<type##_t, size>
+#define HWY_NEON_BUILD_RET_3(type, size) Vec128<type##_t, size>
+
+// HWY_NEON_BUILD_PARAM_* is the list of parameters the function receives.
+#define HWY_NEON_BUILD_PARAM_1(type, size) const Vec128<type##_t, size> a
+#define HWY_NEON_BUILD_PARAM_2(type, size) \
+  const Vec128<type##_t, size> a, const Vec128<type##_t, size> b
+#define HWY_NEON_BUILD_PARAM_3(type, size)                        \
+  const Vec128<type##_t, size> a, const Vec128<type##_t, size> b, \
+      const Vec128<type##_t, size> c
+
+// HWY_NEON_BUILD_ARG_* is the list of arguments passed to the underlying
+// function.
+#define HWY_NEON_BUILD_ARG_1 a.raw
+#define HWY_NEON_BUILD_ARG_2 a.raw, b.raw
+#define HWY_NEON_BUILD_ARG_3 a.raw, b.raw, c.raw
+
+// We use HWY_NEON_EVAL(func, ...) to delay the evaluation of func until after
+// the __VA_ARGS__ have been expanded. This allows "func" to itself be a
+// macro, as is the case for some of the library "functions" such as vshlq_u8.
+// For example, HWY_NEON_EVAL(vshlq_u8, MY_PARAMS), where MY_PARAMS is defined
+// as "a, b" (without the quotes), ends up expanding to "vshlq_u8(a, b)" if
+// needed. Directly writing vshlq_u8(MY_PARAMS) would fail because the
+// vshlq_u8() macro expects two arguments.
+#define HWY_NEON_EVAL(func, ...) func(__VA_ARGS__)
+
+// Main macro definition that defines a single function for the given type and
+// size of vector, using the underlying (prefix##infix##suffix) function and
+// the template, return type, parameters and arguments defined by the "args"
+// parameters passed here (see the HWY_NEON_BUILD_* macros defined above).
+#define HWY_NEON_DEF_FUNCTION(type, size, name, prefix, infix, suffix, args) \
+  HWY_CONCAT(HWY_NEON_BUILD_TPL_, args)                                      \
+  HWY_API HWY_CONCAT(HWY_NEON_BUILD_RET_, args)(type, size)                  \
+      name(HWY_CONCAT(HWY_NEON_BUILD_PARAM_, args)(type, size)) {            \
+    return HWY_CONCAT(HWY_NEON_BUILD_RET_, args)(type, size)(                \
+        HWY_NEON_EVAL(prefix##infix##suffix, HWY_NEON_BUILD_ARG_##args));    \
+  }
+
+// The HWY_NEON_DEF_FUNCTION_* macros define all the variants of a function
+// called "name" using the set of NEON functions starting with the given
+// "prefix" for all the variants of certain types, as specified next to each
+// macro. For example, the prefix "vsub" can be used to define the operator-
+// using args=2.
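+// (Illustrative expansion: HWY_NEON_DEF_FUNCTION(uint8, 16, operator+, vaddq,
+// _, u8, 2) defines
+//   HWY_API Vec128<uint8_t, 16> operator+(const Vec128<uint8_t, 16> a,
+//                                         const Vec128<uint8_t, 16> b) {
+//     return Vec128<uint8_t, 16>(vaddq_u8(a.raw, b.raw));
+//   }
+// which is how the full-vector overload of operator+ is produced further
+// below.)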
+ +// uint8_t +#define HWY_NEON_DEF_FUNCTION_UINT_8(name, prefix, infix, args) \ + HWY_NEON_DEF_FUNCTION(uint8, 16, name, prefix##q, infix, u8, args) \ + HWY_NEON_DEF_FUNCTION(uint8, 8, name, prefix, infix, u8, args) \ + HWY_NEON_DEF_FUNCTION(uint8, 4, name, prefix, infix, u8, args) \ + HWY_NEON_DEF_FUNCTION(uint8, 2, name, prefix, infix, u8, args) \ + HWY_NEON_DEF_FUNCTION(uint8, 1, name, prefix, infix, u8, args) + +// int8_t +#define HWY_NEON_DEF_FUNCTION_INT_8(name, prefix, infix, args) \ + HWY_NEON_DEF_FUNCTION(int8, 16, name, prefix##q, infix, s8, args) \ + HWY_NEON_DEF_FUNCTION(int8, 8, name, prefix, infix, s8, args) \ + HWY_NEON_DEF_FUNCTION(int8, 4, name, prefix, infix, s8, args) \ + HWY_NEON_DEF_FUNCTION(int8, 2, name, prefix, infix, s8, args) \ + HWY_NEON_DEF_FUNCTION(int8, 1, name, prefix, infix, s8, args) + +// uint16_t +#define HWY_NEON_DEF_FUNCTION_UINT_16(name, prefix, infix, args) \ + HWY_NEON_DEF_FUNCTION(uint16, 8, name, prefix##q, infix, u16, args) \ + HWY_NEON_DEF_FUNCTION(uint16, 4, name, prefix, infix, u16, args) \ + HWY_NEON_DEF_FUNCTION(uint16, 2, name, prefix, infix, u16, args) \ + HWY_NEON_DEF_FUNCTION(uint16, 1, name, prefix, infix, u16, args) + +// int16_t +#define HWY_NEON_DEF_FUNCTION_INT_16(name, prefix, infix, args) \ + HWY_NEON_DEF_FUNCTION(int16, 8, name, prefix##q, infix, s16, args) \ + HWY_NEON_DEF_FUNCTION(int16, 4, name, prefix, infix, s16, args) \ + HWY_NEON_DEF_FUNCTION(int16, 2, name, prefix, infix, s16, args) \ + HWY_NEON_DEF_FUNCTION(int16, 1, name, prefix, infix, s16, args) + +// uint32_t +#define HWY_NEON_DEF_FUNCTION_UINT_32(name, prefix, infix, args) \ + HWY_NEON_DEF_FUNCTION(uint32, 4, name, prefix##q, infix, u32, args) \ + HWY_NEON_DEF_FUNCTION(uint32, 2, name, prefix, infix, u32, args) \ + HWY_NEON_DEF_FUNCTION(uint32, 1, name, prefix, infix, u32, args) + +// int32_t +#define HWY_NEON_DEF_FUNCTION_INT_32(name, prefix, infix, args) \ + HWY_NEON_DEF_FUNCTION(int32, 4, name, prefix##q, infix, s32, args) \ + HWY_NEON_DEF_FUNCTION(int32, 2, name, prefix, infix, s32, args) \ + HWY_NEON_DEF_FUNCTION(int32, 1, name, prefix, infix, s32, args) + +// uint64_t +#define HWY_NEON_DEF_FUNCTION_UINT_64(name, prefix, infix, args) \ + HWY_NEON_DEF_FUNCTION(uint64, 2, name, prefix##q, infix, u64, args) \ + HWY_NEON_DEF_FUNCTION(uint64, 1, name, prefix, infix, u64, args) + +// int64_t +#define HWY_NEON_DEF_FUNCTION_INT_64(name, prefix, infix, args) \ + HWY_NEON_DEF_FUNCTION(int64, 2, name, prefix##q, infix, s64, args) \ + HWY_NEON_DEF_FUNCTION(int64, 1, name, prefix, infix, s64, args) + +// float +#define HWY_NEON_DEF_FUNCTION_FLOAT_32(name, prefix, infix, args) \ + HWY_NEON_DEF_FUNCTION(float32, 4, name, prefix##q, infix, f32, args) \ + HWY_NEON_DEF_FUNCTION(float32, 2, name, prefix, infix, f32, args) \ + HWY_NEON_DEF_FUNCTION(float32, 1, name, prefix, infix, f32, args) + +// double +#if HWY_ARCH_ARM_A64 +#define HWY_NEON_DEF_FUNCTION_FLOAT_64(name, prefix, infix, args) \ + HWY_NEON_DEF_FUNCTION(float64, 2, name, prefix##q, infix, f64, args) \ + HWY_NEON_DEF_FUNCTION(float64, 1, name, prefix, infix, f64, args) +#else +#define HWY_NEON_DEF_FUNCTION_FLOAT_64(name, prefix, infix, args) +#endif + +// float and double + +#define HWY_NEON_DEF_FUNCTION_ALL_FLOATS(name, prefix, infix, args) \ + HWY_NEON_DEF_FUNCTION_FLOAT_32(name, prefix, infix, args) \ + HWY_NEON_DEF_FUNCTION_FLOAT_64(name, prefix, infix, args) + +// Helper macros to define for more than one type. 
+// uint8_t, uint16_t and uint32_t +#define HWY_NEON_DEF_FUNCTION_UINT_8_16_32(name, prefix, infix, args) \ + HWY_NEON_DEF_FUNCTION_UINT_8(name, prefix, infix, args) \ + HWY_NEON_DEF_FUNCTION_UINT_16(name, prefix, infix, args) \ + HWY_NEON_DEF_FUNCTION_UINT_32(name, prefix, infix, args) + +// int8_t, int16_t and int32_t +#define HWY_NEON_DEF_FUNCTION_INT_8_16_32(name, prefix, infix, args) \ + HWY_NEON_DEF_FUNCTION_INT_8(name, prefix, infix, args) \ + HWY_NEON_DEF_FUNCTION_INT_16(name, prefix, infix, args) \ + HWY_NEON_DEF_FUNCTION_INT_32(name, prefix, infix, args) + +// uint8_t, uint16_t, uint32_t and uint64_t +#define HWY_NEON_DEF_FUNCTION_UINTS(name, prefix, infix, args) \ + HWY_NEON_DEF_FUNCTION_UINT_8_16_32(name, prefix, infix, args) \ + HWY_NEON_DEF_FUNCTION_UINT_64(name, prefix, infix, args) + +// int8_t, int16_t, int32_t and int64_t +#define HWY_NEON_DEF_FUNCTION_INTS(name, prefix, infix, args) \ + HWY_NEON_DEF_FUNCTION_INT_8_16_32(name, prefix, infix, args) \ + HWY_NEON_DEF_FUNCTION_INT_64(name, prefix, infix, args) + +// All int*_t and uint*_t up to 64 +#define HWY_NEON_DEF_FUNCTION_INTS_UINTS(name, prefix, infix, args) \ + HWY_NEON_DEF_FUNCTION_INTS(name, prefix, infix, args) \ + HWY_NEON_DEF_FUNCTION_UINTS(name, prefix, infix, args) + +// All previous types. +#define HWY_NEON_DEF_FUNCTION_ALL_TYPES(name, prefix, infix, args) \ + HWY_NEON_DEF_FUNCTION_INTS_UINTS(name, prefix, infix, args) \ + HWY_NEON_DEF_FUNCTION_ALL_FLOATS(name, prefix, infix, args) + +#define HWY_NEON_DEF_FUNCTION_UIF81632(name, prefix, infix, args) \ + HWY_NEON_DEF_FUNCTION_UINT_8_16_32(name, prefix, infix, args) \ + HWY_NEON_DEF_FUNCTION_INT_8_16_32(name, prefix, infix, args) \ + HWY_NEON_DEF_FUNCTION_FLOAT_32(name, prefix, infix, args) + +// Emulation of some intrinsics on armv7. 
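+// (ARMv7 lacks the AArch64 vzip1/vzip2 and vuzp1/vuzp2 intrinsics; the
+// mappings below express them via the paired v7 vzip/vuzp intrinsics, which
+// return both halves as the .val[0]/.val[1] members of a x2 struct.)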
+#if HWY_ARCH_ARM_V7 +#define vuzp1_s8(x, y) vuzp_s8(x, y).val[0] +#define vuzp1_u8(x, y) vuzp_u8(x, y).val[0] +#define vuzp1_s16(x, y) vuzp_s16(x, y).val[0] +#define vuzp1_u16(x, y) vuzp_u16(x, y).val[0] +#define vuzp1_s32(x, y) vuzp_s32(x, y).val[0] +#define vuzp1_u32(x, y) vuzp_u32(x, y).val[0] +#define vuzp1_f32(x, y) vuzp_f32(x, y).val[0] +#define vuzp1q_s8(x, y) vuzpq_s8(x, y).val[0] +#define vuzp1q_u8(x, y) vuzpq_u8(x, y).val[0] +#define vuzp1q_s16(x, y) vuzpq_s16(x, y).val[0] +#define vuzp1q_u16(x, y) vuzpq_u16(x, y).val[0] +#define vuzp1q_s32(x, y) vuzpq_s32(x, y).val[0] +#define vuzp1q_u32(x, y) vuzpq_u32(x, y).val[0] +#define vuzp1q_f32(x, y) vuzpq_f32(x, y).val[0] +#define vuzp2_s8(x, y) vuzp_s8(x, y).val[1] +#define vuzp2_u8(x, y) vuzp_u8(x, y).val[1] +#define vuzp2_s16(x, y) vuzp_s16(x, y).val[1] +#define vuzp2_u16(x, y) vuzp_u16(x, y).val[1] +#define vuzp2_s32(x, y) vuzp_s32(x, y).val[1] +#define vuzp2_u32(x, y) vuzp_u32(x, y).val[1] +#define vuzp2_f32(x, y) vuzp_f32(x, y).val[1] +#define vuzp2q_s8(x, y) vuzpq_s8(x, y).val[1] +#define vuzp2q_u8(x, y) vuzpq_u8(x, y).val[1] +#define vuzp2q_s16(x, y) vuzpq_s16(x, y).val[1] +#define vuzp2q_u16(x, y) vuzpq_u16(x, y).val[1] +#define vuzp2q_s32(x, y) vuzpq_s32(x, y).val[1] +#define vuzp2q_u32(x, y) vuzpq_u32(x, y).val[1] +#define vuzp2q_f32(x, y) vuzpq_f32(x, y).val[1] +#define vzip1_s8(x, y) vzip_s8(x, y).val[0] +#define vzip1_u8(x, y) vzip_u8(x, y).val[0] +#define vzip1_s16(x, y) vzip_s16(x, y).val[0] +#define vzip1_u16(x, y) vzip_u16(x, y).val[0] +#define vzip1_f32(x, y) vzip_f32(x, y).val[0] +#define vzip1_u32(x, y) vzip_u32(x, y).val[0] +#define vzip1_s32(x, y) vzip_s32(x, y).val[0] +#define vzip1q_s8(x, y) vzipq_s8(x, y).val[0] +#define vzip1q_u8(x, y) vzipq_u8(x, y).val[0] +#define vzip1q_s16(x, y) vzipq_s16(x, y).val[0] +#define vzip1q_u16(x, y) vzipq_u16(x, y).val[0] +#define vzip1q_s32(x, y) vzipq_s32(x, y).val[0] +#define vzip1q_u32(x, y) vzipq_u32(x, y).val[0] +#define vzip1q_f32(x, y) vzipq_f32(x, y).val[0] +#define vzip2_s8(x, y) vzip_s8(x, y).val[1] +#define vzip2_u8(x, y) vzip_u8(x, y).val[1] +#define vzip2_s16(x, y) vzip_s16(x, y).val[1] +#define vzip2_u16(x, y) vzip_u16(x, y).val[1] +#define vzip2_s32(x, y) vzip_s32(x, y).val[1] +#define vzip2_u32(x, y) vzip_u32(x, y).val[1] +#define vzip2_f32(x, y) vzip_f32(x, y).val[1] +#define vzip2q_s8(x, y) vzipq_s8(x, y).val[1] +#define vzip2q_u8(x, y) vzipq_u8(x, y).val[1] +#define vzip2q_s16(x, y) vzipq_s16(x, y).val[1] +#define vzip2q_u16(x, y) vzipq_u16(x, y).val[1] +#define vzip2q_s32(x, y) vzipq_s32(x, y).val[1] +#define vzip2q_u32(x, y) vzipq_u32(x, y).val[1] +#define vzip2q_f32(x, y) vzipq_f32(x, y).val[1] +#endif + +// Wrappers over uint8x16x2_t etc. so we can define StoreInterleaved2 overloads +// for all vector types, even those (bfloat16_t) where the underlying vector is +// the same as others (uint16_t). 
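+// (For example, the bfloat16_t and uint16_t tuples both wrap a uint16x8x2_t,
+// but remain distinct C++ types so overload resolution can tell them apart.)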
+template +struct Tuple2; +template +struct Tuple3; +template +struct Tuple4; + +template <> +struct Tuple2 { + uint8x16x2_t raw; +}; +template +struct Tuple2 { + uint8x8x2_t raw; +}; +template <> +struct Tuple2 { + int8x16x2_t raw; +}; +template +struct Tuple2 { + int8x8x2_t raw; +}; +template <> +struct Tuple2 { + uint16x8x2_t raw; +}; +template +struct Tuple2 { + uint16x4x2_t raw; +}; +template <> +struct Tuple2 { + int16x8x2_t raw; +}; +template +struct Tuple2 { + int16x4x2_t raw; +}; +template <> +struct Tuple2 { + uint32x4x2_t raw; +}; +template +struct Tuple2 { + uint32x2x2_t raw; +}; +template <> +struct Tuple2 { + int32x4x2_t raw; +}; +template +struct Tuple2 { + int32x2x2_t raw; +}; +template <> +struct Tuple2 { + uint64x2x2_t raw; +}; +template +struct Tuple2 { + uint64x1x2_t raw; +}; +template <> +struct Tuple2 { + int64x2x2_t raw; +}; +template +struct Tuple2 { + int64x1x2_t raw; +}; + +template <> +struct Tuple2 { + uint16x8x2_t raw; +}; +template +struct Tuple2 { + uint16x4x2_t raw; +}; +template <> +struct Tuple2 { + uint16x8x2_t raw; +}; +template +struct Tuple2 { + uint16x4x2_t raw; +}; + +template <> +struct Tuple2 { + float32x4x2_t raw; +}; +template +struct Tuple2 { + float32x2x2_t raw; +}; +#if HWY_ARCH_ARM_A64 +template <> +struct Tuple2 { + float64x2x2_t raw; +}; +template +struct Tuple2 { + float64x1x2_t raw; +}; +#endif // HWY_ARCH_ARM_A64 + +template <> +struct Tuple3 { + uint8x16x3_t raw; +}; +template +struct Tuple3 { + uint8x8x3_t raw; +}; +template <> +struct Tuple3 { + int8x16x3_t raw; +}; +template +struct Tuple3 { + int8x8x3_t raw; +}; +template <> +struct Tuple3 { + uint16x8x3_t raw; +}; +template +struct Tuple3 { + uint16x4x3_t raw; +}; +template <> +struct Tuple3 { + int16x8x3_t raw; +}; +template +struct Tuple3 { + int16x4x3_t raw; +}; +template <> +struct Tuple3 { + uint32x4x3_t raw; +}; +template +struct Tuple3 { + uint32x2x3_t raw; +}; +template <> +struct Tuple3 { + int32x4x3_t raw; +}; +template +struct Tuple3 { + int32x2x3_t raw; +}; +template <> +struct Tuple3 { + uint64x2x3_t raw; +}; +template +struct Tuple3 { + uint64x1x3_t raw; +}; +template <> +struct Tuple3 { + int64x2x3_t raw; +}; +template +struct Tuple3 { + int64x1x3_t raw; +}; + +template <> +struct Tuple3 { + uint16x8x3_t raw; +}; +template +struct Tuple3 { + uint16x4x3_t raw; +}; +template <> +struct Tuple3 { + uint16x8x3_t raw; +}; +template +struct Tuple3 { + uint16x4x3_t raw; +}; + +template <> +struct Tuple3 { + float32x4x3_t raw; +}; +template +struct Tuple3 { + float32x2x3_t raw; +}; +#if HWY_ARCH_ARM_A64 +template <> +struct Tuple3 { + float64x2x3_t raw; +}; +template +struct Tuple3 { + float64x1x3_t raw; +}; +#endif // HWY_ARCH_ARM_A64 + +template <> +struct Tuple4 { + uint8x16x4_t raw; +}; +template +struct Tuple4 { + uint8x8x4_t raw; +}; +template <> +struct Tuple4 { + int8x16x4_t raw; +}; +template +struct Tuple4 { + int8x8x4_t raw; +}; +template <> +struct Tuple4 { + uint16x8x4_t raw; +}; +template +struct Tuple4 { + uint16x4x4_t raw; +}; +template <> +struct Tuple4 { + int16x8x4_t raw; +}; +template +struct Tuple4 { + int16x4x4_t raw; +}; +template <> +struct Tuple4 { + uint32x4x4_t raw; +}; +template +struct Tuple4 { + uint32x2x4_t raw; +}; +template <> +struct Tuple4 { + int32x4x4_t raw; +}; +template +struct Tuple4 { + int32x2x4_t raw; +}; +template <> +struct Tuple4 { + uint64x2x4_t raw; +}; +template +struct Tuple4 { + uint64x1x4_t raw; +}; +template <> +struct Tuple4 { + int64x2x4_t raw; +}; +template +struct Tuple4 { + int64x1x4_t raw; +}; + +template <> +struct 
Tuple4 { + uint16x8x4_t raw; +}; +template +struct Tuple4 { + uint16x4x4_t raw; +}; +template <> +struct Tuple4 { + uint16x8x4_t raw; +}; +template +struct Tuple4 { + uint16x4x4_t raw; +}; + +template <> +struct Tuple4 { + float32x4x4_t raw; +}; +template +struct Tuple4 { + float32x2x4_t raw; +}; +#if HWY_ARCH_ARM_A64 +template <> +struct Tuple4 { + float64x2x4_t raw; +}; +template +struct Tuple4 { + float64x1x4_t raw; +}; +#endif // HWY_ARCH_ARM_A64 + +template +struct Raw128; + +// 128 +template <> +struct Raw128 { + using type = uint8x16_t; +}; + +template <> +struct Raw128 { + using type = uint16x8_t; +}; + +template <> +struct Raw128 { + using type = uint32x4_t; +}; + +template <> +struct Raw128 { + using type = uint64x2_t; +}; + +template <> +struct Raw128 { + using type = int8x16_t; +}; + +template <> +struct Raw128 { + using type = int16x8_t; +}; + +template <> +struct Raw128 { + using type = int32x4_t; +}; + +template <> +struct Raw128 { + using type = int64x2_t; +}; + +template <> +struct Raw128 { + using type = uint16x8_t; +}; + +template <> +struct Raw128 { + using type = uint16x8_t; +}; + +template <> +struct Raw128 { + using type = float32x4_t; +}; + +#if HWY_ARCH_ARM_A64 +template <> +struct Raw128 { + using type = float64x2_t; +}; +#endif + +// 64 +template <> +struct Raw128 { + using type = uint8x8_t; +}; + +template <> +struct Raw128 { + using type = uint16x4_t; +}; + +template <> +struct Raw128 { + using type = uint32x2_t; +}; + +template <> +struct Raw128 { + using type = uint64x1_t; +}; + +template <> +struct Raw128 { + using type = int8x8_t; +}; + +template <> +struct Raw128 { + using type = int16x4_t; +}; + +template <> +struct Raw128 { + using type = int32x2_t; +}; + +template <> +struct Raw128 { + using type = int64x1_t; +}; + +template <> +struct Raw128 { + using type = uint16x4_t; +}; + +template <> +struct Raw128 { + using type = uint16x4_t; +}; + +template <> +struct Raw128 { + using type = float32x2_t; +}; + +#if HWY_ARCH_ARM_A64 +template <> +struct Raw128 { + using type = float64x1_t; +}; +#endif + +// 32 (same as 64) +template <> +struct Raw128 : public Raw128 {}; + +template <> +struct Raw128 : public Raw128 {}; + +template <> +struct Raw128 : public Raw128 {}; + +template <> +struct Raw128 : public Raw128 {}; + +template <> +struct Raw128 : public Raw128 {}; + +template <> +struct Raw128 : public Raw128 {}; + +template <> +struct Raw128 : public Raw128 {}; + +template <> +struct Raw128 : public Raw128 {}; + +template <> +struct Raw128 : public Raw128 {}; + +// 16 (same as 64) +template <> +struct Raw128 : public Raw128 {}; + +template <> +struct Raw128 : public Raw128 {}; + +template <> +struct Raw128 : public Raw128 {}; + +template <> +struct Raw128 : public Raw128 {}; + +template <> +struct Raw128 : public Raw128 {}; + +template <> +struct Raw128 : public Raw128 {}; + +// 8 (same as 64) +template <> +struct Raw128 : public Raw128 {}; + +template <> +struct Raw128 : public Raw128 {}; + +} // namespace detail + +template +class Vec128 { + using Raw = typename detail::Raw128::type; + + public: + HWY_INLINE Vec128() {} + Vec128(const Vec128&) = default; + Vec128& operator=(const Vec128&) = default; + HWY_INLINE explicit Vec128(const Raw raw) : raw(raw) {} + + // Compound assignment. Only usable if there is a corresponding non-member + // binary operator overload. For example, only f32 and f64 support division. 
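+  // (e.g. "v += Set(d, 1.0f);" is shorthand for "v = v + Set(d, 1.0f);".)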
+ HWY_INLINE Vec128& operator*=(const Vec128 other) { + return *this = (*this * other); + } + HWY_INLINE Vec128& operator/=(const Vec128 other) { + return *this = (*this / other); + } + HWY_INLINE Vec128& operator+=(const Vec128 other) { + return *this = (*this + other); + } + HWY_INLINE Vec128& operator-=(const Vec128 other) { + return *this = (*this - other); + } + HWY_INLINE Vec128& operator&=(const Vec128 other) { + return *this = (*this & other); + } + HWY_INLINE Vec128& operator|=(const Vec128 other) { + return *this = (*this | other); + } + HWY_INLINE Vec128& operator^=(const Vec128 other) { + return *this = (*this ^ other); + } + + Raw raw; +}; + +template +using Vec64 = Vec128; + +template +using Vec32 = Vec128; + +// FF..FF or 0. +template +class Mask128 { + // ARM C Language Extensions return and expect unsigned type. + using Raw = typename detail::Raw128, N>::type; + + public: + HWY_INLINE Mask128() {} + Mask128(const Mask128&) = default; + Mask128& operator=(const Mask128&) = default; + HWY_INLINE explicit Mask128(const Raw raw) : raw(raw) {} + + Raw raw; +}; + +template +using Mask64 = Mask128; + +namespace detail { + +// Deduce Simd from Vec128 +struct DeduceD { + template + Simd operator()(Vec128) const { + return Simd(); + } +}; + +} // namespace detail + +template +using DFromV = decltype(detail::DeduceD()(V())); + +template +using TFromV = TFromD>; + +// ------------------------------ BitCast + +namespace detail { + +// Converts from Vec128 to Vec128 using the +// vreinterpret*_u8_*() set of functions. +#define HWY_NEON_BUILD_TPL_HWY_CAST_TO_U8 +#define HWY_NEON_BUILD_RET_HWY_CAST_TO_U8(type, size) \ + Vec128 +#define HWY_NEON_BUILD_PARAM_HWY_CAST_TO_U8(type, size) Vec128 v +#define HWY_NEON_BUILD_ARG_HWY_CAST_TO_U8 v.raw + +// Special case of u8 to u8 since vreinterpret*_u8_u8 is obviously not defined. +template +HWY_INLINE Vec128 BitCastToByte(Vec128 v) { + return v; +} + +HWY_NEON_DEF_FUNCTION_ALL_FLOATS(BitCastToByte, vreinterpret, _u8_, + HWY_CAST_TO_U8) +HWY_NEON_DEF_FUNCTION_INTS(BitCastToByte, vreinterpret, _u8_, HWY_CAST_TO_U8) +HWY_NEON_DEF_FUNCTION_UINT_16(BitCastToByte, vreinterpret, _u8_, HWY_CAST_TO_U8) +HWY_NEON_DEF_FUNCTION_UINT_32(BitCastToByte, vreinterpret, _u8_, HWY_CAST_TO_U8) +HWY_NEON_DEF_FUNCTION_UINT_64(BitCastToByte, vreinterpret, _u8_, HWY_CAST_TO_U8) + +// Special cases for [b]float16_t, which have the same Raw as uint16_t. 
+template +HWY_INLINE Vec128 BitCastToByte(Vec128 v) { + return BitCastToByte(Vec128(v.raw)); +} +template +HWY_INLINE Vec128 BitCastToByte(Vec128 v) { + return BitCastToByte(Vec128(v.raw)); +} + +#undef HWY_NEON_BUILD_TPL_HWY_CAST_TO_U8 +#undef HWY_NEON_BUILD_RET_HWY_CAST_TO_U8 +#undef HWY_NEON_BUILD_PARAM_HWY_CAST_TO_U8 +#undef HWY_NEON_BUILD_ARG_HWY_CAST_TO_U8 + +template +HWY_INLINE Vec128 BitCastFromByte(Simd /* tag */, + Vec128 v) { + return v; +} + +// 64-bit or less: + +template +HWY_INLINE Vec128 BitCastFromByte(Simd /* tag */, + Vec128 v) { + return Vec128(vreinterpret_s8_u8(v.raw)); +} +template +HWY_INLINE Vec128 BitCastFromByte(Simd /* tag */, + Vec128 v) { + return Vec128(vreinterpret_u16_u8(v.raw)); +} +template +HWY_INLINE Vec128 BitCastFromByte(Simd /* tag */, + Vec128 v) { + return Vec128(vreinterpret_s16_u8(v.raw)); +} +template +HWY_INLINE Vec128 BitCastFromByte(Simd /* tag */, + Vec128 v) { + return Vec128(vreinterpret_u32_u8(v.raw)); +} +template +HWY_INLINE Vec128 BitCastFromByte(Simd /* tag */, + Vec128 v) { + return Vec128(vreinterpret_s32_u8(v.raw)); +} +template +HWY_INLINE Vec128 BitCastFromByte(Simd /* tag */, + Vec128 v) { + return Vec128(vreinterpret_f32_u8(v.raw)); +} +HWY_INLINE Vec64 BitCastFromByte(Full64 /* tag */, + Vec128 v) { + return Vec64(vreinterpret_u64_u8(v.raw)); +} +HWY_INLINE Vec64 BitCastFromByte(Full64 /* tag */, + Vec128 v) { + return Vec64(vreinterpret_s64_u8(v.raw)); +} +#if HWY_ARCH_ARM_A64 +HWY_INLINE Vec64 BitCastFromByte(Full64 /* tag */, + Vec128 v) { + return Vec64(vreinterpret_f64_u8(v.raw)); +} +#endif + +// 128-bit full: + +HWY_INLINE Vec128 BitCastFromByte(Full128 /* tag */, + Vec128 v) { + return Vec128(vreinterpretq_s8_u8(v.raw)); +} +HWY_INLINE Vec128 BitCastFromByte(Full128 /* tag */, + Vec128 v) { + return Vec128(vreinterpretq_u16_u8(v.raw)); +} +HWY_INLINE Vec128 BitCastFromByte(Full128 /* tag */, + Vec128 v) { + return Vec128(vreinterpretq_s16_u8(v.raw)); +} +HWY_INLINE Vec128 BitCastFromByte(Full128 /* tag */, + Vec128 v) { + return Vec128(vreinterpretq_u32_u8(v.raw)); +} +HWY_INLINE Vec128 BitCastFromByte(Full128 /* tag */, + Vec128 v) { + return Vec128(vreinterpretq_s32_u8(v.raw)); +} +HWY_INLINE Vec128 BitCastFromByte(Full128 /* tag */, + Vec128 v) { + return Vec128(vreinterpretq_f32_u8(v.raw)); +} +HWY_INLINE Vec128 BitCastFromByte(Full128 /* tag */, + Vec128 v) { + return Vec128(vreinterpretq_u64_u8(v.raw)); +} +HWY_INLINE Vec128 BitCastFromByte(Full128 /* tag */, + Vec128 v) { + return Vec128(vreinterpretq_s64_u8(v.raw)); +} + +#if HWY_ARCH_ARM_A64 +HWY_INLINE Vec128 BitCastFromByte(Full128 /* tag */, + Vec128 v) { + return Vec128(vreinterpretq_f64_u8(v.raw)); +} +#endif + +// Special cases for [b]float16_t, which have the same Raw as uint16_t. +template +HWY_INLINE Vec128 BitCastFromByte(Simd /* tag */, + Vec128 v) { + return Vec128(BitCastFromByte(Simd(), v).raw); +} +template +HWY_INLINE Vec128 BitCastFromByte( + Simd /* tag */, Vec128 v) { + return Vec128(BitCastFromByte(Simd(), v).raw); +} + +} // namespace detail + +template +HWY_API Vec128 BitCast(Simd d, + Vec128 v) { + return detail::BitCastFromByte(d, detail::BitCastToByte(v)); +} + +// ------------------------------ Set + +// Returns a vector with all lanes set to "t". 
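+// (e.g. Set(Full128<uint32_t>(), 7u) resolves to vdupq_n_u32(7u), i.e. all
+// four u32 lanes equal 7.)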
+#define HWY_NEON_BUILD_TPL_HWY_SET1 +#define HWY_NEON_BUILD_RET_HWY_SET1(type, size) Vec128 +#define HWY_NEON_BUILD_PARAM_HWY_SET1(type, size) \ + Simd /* tag */, const type##_t t +#define HWY_NEON_BUILD_ARG_HWY_SET1 t + +HWY_NEON_DEF_FUNCTION_ALL_TYPES(Set, vdup, _n_, HWY_SET1) + +#undef HWY_NEON_BUILD_TPL_HWY_SET1 +#undef HWY_NEON_BUILD_RET_HWY_SET1 +#undef HWY_NEON_BUILD_PARAM_HWY_SET1 +#undef HWY_NEON_BUILD_ARG_HWY_SET1 + +// Returns an all-zero vector. +template +HWY_API Vec128 Zero(Simd d) { + return Set(d, 0); +} + +template +HWY_API Vec128 Zero(Simd /* tag */) { + return Vec128(Zero(Simd()).raw); +} + +template +using VFromD = decltype(Zero(D())); + +HWY_DIAGNOSTICS(push) +HWY_DIAGNOSTICS_OFF(disable : 4701, ignored "-Wuninitialized") +#if HWY_COMPILER_GCC_ACTUAL + HWY_DIAGNOSTICS_OFF(disable : 4701, ignored "-Wmaybe-uninitialized") +#endif + +// Returns a vector with uninitialized elements. +template +HWY_API Vec128 Undefined(Simd /*d*/) { + typename detail::Raw128::type a; + return Vec128(a); +} + +HWY_DIAGNOSTICS(pop) + +// Returns a vector with lane i=[0, N) set to "first" + i. +template +Vec128 Iota(const Simd d, const T2 first) { + HWY_ALIGN T lanes[16 / sizeof(T)]; + for (size_t i = 0; i < 16 / sizeof(T); ++i) { + lanes[i] = static_cast(first + static_cast(i)); + } + return Load(d, lanes); +} + +// ------------------------------ GetLane + +namespace detail { +#define HWY_NEON_BUILD_TPL_HWY_GET template +#define HWY_NEON_BUILD_RET_HWY_GET(type, size) type##_t +#define HWY_NEON_BUILD_PARAM_HWY_GET(type, size) Vec128 v +#define HWY_NEON_BUILD_ARG_HWY_GET v.raw, kLane + +HWY_NEON_DEF_FUNCTION_ALL_TYPES(GetLane, vget, _lane_, HWY_GET) + +#undef HWY_NEON_BUILD_TPL_HWY_GET +#undef HWY_NEON_BUILD_RET_HWY_GET +#undef HWY_NEON_BUILD_PARAM_HWY_GET +#undef HWY_NEON_BUILD_ARG_HWY_GET + +} // namespace detail + +template +HWY_API TFromV GetLane(const V v) { + return detail::GetLane<0>(v); +} + +// ------------------------------ ExtractLane + +// Requires one overload per vector length because GetLane<3> is a compile error +// if v is a uint32x2_t. 
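+// (Usage sketch: ExtractLane(v, i) with a compile-time-constant i compiles to
+// a single vget_lane/vgetq_lane; a runtime i falls back to Store plus array
+// indexing, as in the overloads below.)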
+template +HWY_API T ExtractLane(const Vec128 v, size_t i) { + HWY_DASSERT(i == 0); + (void)i; + return detail::GetLane<0>(v); +} + +template +HWY_API T ExtractLane(const Vec128 v, size_t i) { +#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang + if (__builtin_constant_p(i)) { + switch (i) { + case 0: + return detail::GetLane<0>(v); + case 1: + return detail::GetLane<1>(v); + } + } +#endif + alignas(16) T lanes[2]; + Store(v, DFromV(), lanes); + return lanes[i]; +} + +template +HWY_API T ExtractLane(const Vec128 v, size_t i) { +#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang + if (__builtin_constant_p(i)) { + switch (i) { + case 0: + return detail::GetLane<0>(v); + case 1: + return detail::GetLane<1>(v); + case 2: + return detail::GetLane<2>(v); + case 3: + return detail::GetLane<3>(v); + } + } +#endif + alignas(16) T lanes[4]; + Store(v, DFromV(), lanes); + return lanes[i]; +} + +template +HWY_API T ExtractLane(const Vec128 v, size_t i) { +#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang + if (__builtin_constant_p(i)) { + switch (i) { + case 0: + return detail::GetLane<0>(v); + case 1: + return detail::GetLane<1>(v); + case 2: + return detail::GetLane<2>(v); + case 3: + return detail::GetLane<3>(v); + case 4: + return detail::GetLane<4>(v); + case 5: + return detail::GetLane<5>(v); + case 6: + return detail::GetLane<6>(v); + case 7: + return detail::GetLane<7>(v); + } + } +#endif + alignas(16) T lanes[8]; + Store(v, DFromV(), lanes); + return lanes[i]; +} + +template +HWY_API T ExtractLane(const Vec128 v, size_t i) { +#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang + if (__builtin_constant_p(i)) { + switch (i) { + case 0: + return detail::GetLane<0>(v); + case 1: + return detail::GetLane<1>(v); + case 2: + return detail::GetLane<2>(v); + case 3: + return detail::GetLane<3>(v); + case 4: + return detail::GetLane<4>(v); + case 5: + return detail::GetLane<5>(v); + case 6: + return detail::GetLane<6>(v); + case 7: + return detail::GetLane<7>(v); + case 8: + return detail::GetLane<8>(v); + case 9: + return detail::GetLane<9>(v); + case 10: + return detail::GetLane<10>(v); + case 11: + return detail::GetLane<11>(v); + case 12: + return detail::GetLane<12>(v); + case 13: + return detail::GetLane<13>(v); + case 14: + return detail::GetLane<14>(v); + case 15: + return detail::GetLane<15>(v); + } + } +#endif + alignas(16) T lanes[16]; + Store(v, DFromV(), lanes); + return lanes[i]; +} + +// ------------------------------ InsertLane + +namespace detail { +#define HWY_NEON_BUILD_TPL_HWY_INSERT template +#define HWY_NEON_BUILD_RET_HWY_INSERT(type, size) Vec128 +#define HWY_NEON_BUILD_PARAM_HWY_INSERT(type, size) \ + Vec128 v, type##_t t +#define HWY_NEON_BUILD_ARG_HWY_INSERT t, v.raw, kLane + +HWY_NEON_DEF_FUNCTION_ALL_TYPES(InsertLane, vset, _lane_, HWY_INSERT) + +#undef HWY_NEON_BUILD_TPL_HWY_INSERT +#undef HWY_NEON_BUILD_RET_HWY_INSERT +#undef HWY_NEON_BUILD_PARAM_HWY_INSERT +#undef HWY_NEON_BUILD_ARG_HWY_INSERT + +} // namespace detail + +// Requires one overload per vector length because InsertLane<3> may be a +// compile error. 
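+// (Analogous to ExtractLane: a constant index uses vset_lane via
+// detail::InsertLane<kLane>, while a runtime index goes through Store,
+// lanes[i] = t, Load.)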
+ +template +HWY_API Vec128 InsertLane(const Vec128 v, size_t i, T t) { + HWY_DASSERT(i == 0); + (void)i; + return Set(DFromV(), t); +} + +template +HWY_API Vec128 InsertLane(const Vec128 v, size_t i, T t) { +#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang + if (__builtin_constant_p(i)) { + switch (i) { + case 0: + return detail::InsertLane<0>(v, t); + case 1: + return detail::InsertLane<1>(v, t); + } + } +#endif + const DFromV d; + alignas(16) T lanes[2]; + Store(v, d, lanes); + lanes[i] = t; + return Load(d, lanes); +} + +template +HWY_API Vec128 InsertLane(const Vec128 v, size_t i, T t) { +#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang + if (__builtin_constant_p(i)) { + switch (i) { + case 0: + return detail::InsertLane<0>(v, t); + case 1: + return detail::InsertLane<1>(v, t); + case 2: + return detail::InsertLane<2>(v, t); + case 3: + return detail::InsertLane<3>(v, t); + } + } +#endif + const DFromV d; + alignas(16) T lanes[4]; + Store(v, d, lanes); + lanes[i] = t; + return Load(d, lanes); +} + +template +HWY_API Vec128 InsertLane(const Vec128 v, size_t i, T t) { +#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang + if (__builtin_constant_p(i)) { + switch (i) { + case 0: + return detail::InsertLane<0>(v, t); + case 1: + return detail::InsertLane<1>(v, t); + case 2: + return detail::InsertLane<2>(v, t); + case 3: + return detail::InsertLane<3>(v, t); + case 4: + return detail::InsertLane<4>(v, t); + case 5: + return detail::InsertLane<5>(v, t); + case 6: + return detail::InsertLane<6>(v, t); + case 7: + return detail::InsertLane<7>(v, t); + } + } +#endif + const DFromV d; + alignas(16) T lanes[8]; + Store(v, d, lanes); + lanes[i] = t; + return Load(d, lanes); +} + +template +HWY_API Vec128 InsertLane(const Vec128 v, size_t i, T t) { +#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang + if (__builtin_constant_p(i)) { + switch (i) { + case 0: + return detail::InsertLane<0>(v, t); + case 1: + return detail::InsertLane<1>(v, t); + case 2: + return detail::InsertLane<2>(v, t); + case 3: + return detail::InsertLane<3>(v, t); + case 4: + return detail::InsertLane<4>(v, t); + case 5: + return detail::InsertLane<5>(v, t); + case 6: + return detail::InsertLane<6>(v, t); + case 7: + return detail::InsertLane<7>(v, t); + case 8: + return detail::InsertLane<8>(v, t); + case 9: + return detail::InsertLane<9>(v, t); + case 10: + return detail::InsertLane<10>(v, t); + case 11: + return detail::InsertLane<11>(v, t); + case 12: + return detail::InsertLane<12>(v, t); + case 13: + return detail::InsertLane<13>(v, t); + case 14: + return detail::InsertLane<14>(v, t); + case 15: + return detail::InsertLane<15>(v, t); + } + } +#endif + const DFromV d; + alignas(16) T lanes[16]; + Store(v, d, lanes); + lanes[i] = t; + return Load(d, lanes); +} + +// ================================================== ARITHMETIC + +// ------------------------------ Addition +HWY_NEON_DEF_FUNCTION_ALL_TYPES(operator+, vadd, _, 2) + +// ------------------------------ Subtraction +HWY_NEON_DEF_FUNCTION_ALL_TYPES(operator-, vsub, _, 2) + +// ------------------------------ SumsOf8 + +HWY_API Vec128 SumsOf8(const Vec128 v) { + return Vec128(vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(v.raw)))); +} +HWY_API Vec64 SumsOf8(const Vec64 v) { + return Vec64(vpaddl_u32(vpaddl_u16(vpaddl_u8(v.raw)))); +} + +// ------------------------------ SaturatedAdd +// Only defined for uint8_t, uint16_t and their signed versions, as in other +// architectures. 
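+// (e.g. for uint8_t lanes, SaturatedAdd(Set(d, 200), Set(d, 100)) yields 255
+// in every lane instead of wrapping to 44.)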
+ +// Returns a + b clamped to the destination range. +HWY_NEON_DEF_FUNCTION_INT_8(SaturatedAdd, vqadd, _, 2) +HWY_NEON_DEF_FUNCTION_INT_16(SaturatedAdd, vqadd, _, 2) +HWY_NEON_DEF_FUNCTION_UINT_8(SaturatedAdd, vqadd, _, 2) +HWY_NEON_DEF_FUNCTION_UINT_16(SaturatedAdd, vqadd, _, 2) + +// ------------------------------ SaturatedSub + +// Returns a - b clamped to the destination range. +HWY_NEON_DEF_FUNCTION_INT_8(SaturatedSub, vqsub, _, 2) +HWY_NEON_DEF_FUNCTION_INT_16(SaturatedSub, vqsub, _, 2) +HWY_NEON_DEF_FUNCTION_UINT_8(SaturatedSub, vqsub, _, 2) +HWY_NEON_DEF_FUNCTION_UINT_16(SaturatedSub, vqsub, _, 2) + +// Not part of API, used in implementation. +namespace detail { +HWY_NEON_DEF_FUNCTION_UINT_32(SaturatedSub, vqsub, _, 2) +HWY_NEON_DEF_FUNCTION_UINT_64(SaturatedSub, vqsub, _, 2) +HWY_NEON_DEF_FUNCTION_INT_32(SaturatedSub, vqsub, _, 2) +HWY_NEON_DEF_FUNCTION_INT_64(SaturatedSub, vqsub, _, 2) +} // namespace detail + +// ------------------------------ Average + +// Returns (a + b + 1) / 2 +HWY_NEON_DEF_FUNCTION_UINT_8(AverageRound, vrhadd, _, 2) +HWY_NEON_DEF_FUNCTION_UINT_16(AverageRound, vrhadd, _, 2) + +// ------------------------------ Neg + +HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Neg, vneg, _, 1) +HWY_NEON_DEF_FUNCTION_INT_8_16_32(Neg, vneg, _, 1) // i64 implemented below + +HWY_API Vec64 Neg(const Vec64 v) { +#if HWY_ARCH_ARM_A64 + return Vec64(vneg_s64(v.raw)); +#else + return Zero(Full64()) - v; +#endif +} + +HWY_API Vec128 Neg(const Vec128 v) { +#if HWY_ARCH_ARM_A64 + return Vec128(vnegq_s64(v.raw)); +#else + return Zero(Full128()) - v; +#endif +} + +// ------------------------------ ShiftLeft + +// Customize HWY_NEON_DEF_FUNCTION to special-case count=0 (not supported). +#pragma push_macro("HWY_NEON_DEF_FUNCTION") +#undef HWY_NEON_DEF_FUNCTION +#define HWY_NEON_DEF_FUNCTION(type, size, name, prefix, infix, suffix, args) \ + template \ + HWY_API Vec128 name(const Vec128 v) { \ + return kBits == 0 ? v \ + : Vec128(HWY_NEON_EVAL( \ + prefix##infix##suffix, v.raw, HWY_MAX(1, kBits))); \ + } + +HWY_NEON_DEF_FUNCTION_INTS_UINTS(ShiftLeft, vshl, _n_, ignored) + +HWY_NEON_DEF_FUNCTION_UINTS(ShiftRight, vshr, _n_, ignored) +HWY_NEON_DEF_FUNCTION_INTS(ShiftRight, vshr, _n_, ignored) + +#pragma pop_macro("HWY_NEON_DEF_FUNCTION") + +// ------------------------------ RotateRight (ShiftRight, Or) + +template +HWY_API Vec128 RotateRight(const Vec128 v) { + static_assert(0 <= kBits && kBits < 32, "Invalid shift count"); + if (kBits == 0) return v; + return Or(ShiftRight(v), ShiftLeft(v)); +} + +template +HWY_API Vec128 RotateRight(const Vec128 v) { + static_assert(0 <= kBits && kBits < 64, "Invalid shift count"); + if (kBits == 0) return v; + return Or(ShiftRight(v), ShiftLeft(v)); +} + +// NOTE: vxarq_u64 can be applied to uint64_t, but we do not yet have a +// mechanism for checking for extensions to ARMv8. 
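+// (e.g. RotateRight<8>(Set(d, 0x12345678u)) yields 0x78123456 in every u32
+// lane: the low byte wraps around into the top byte.)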
+ +// ------------------------------ Shl + +HWY_API Vec128 operator<<(const Vec128 v, + const Vec128 bits) { + return Vec128(vshlq_u8(v.raw, vreinterpretq_s8_u8(bits.raw))); +} +template +HWY_API Vec128 operator<<(const Vec128 v, + const Vec128 bits) { + return Vec128(vshl_u8(v.raw, vreinterpret_s8_u8(bits.raw))); +} + +HWY_API Vec128 operator<<(const Vec128 v, + const Vec128 bits) { + return Vec128(vshlq_u16(v.raw, vreinterpretq_s16_u16(bits.raw))); +} +template +HWY_API Vec128 operator<<(const Vec128 v, + const Vec128 bits) { + return Vec128(vshl_u16(v.raw, vreinterpret_s16_u16(bits.raw))); +} + +HWY_API Vec128 operator<<(const Vec128 v, + const Vec128 bits) { + return Vec128(vshlq_u32(v.raw, vreinterpretq_s32_u32(bits.raw))); +} +template +HWY_API Vec128 operator<<(const Vec128 v, + const Vec128 bits) { + return Vec128(vshl_u32(v.raw, vreinterpret_s32_u32(bits.raw))); +} + +HWY_API Vec128 operator<<(const Vec128 v, + const Vec128 bits) { + return Vec128(vshlq_u64(v.raw, vreinterpretq_s64_u64(bits.raw))); +} +HWY_API Vec64 operator<<(const Vec64 v, + const Vec64 bits) { + return Vec64(vshl_u64(v.raw, vreinterpret_s64_u64(bits.raw))); +} + +HWY_API Vec128 operator<<(const Vec128 v, + const Vec128 bits) { + return Vec128(vshlq_s8(v.raw, bits.raw)); +} +template +HWY_API Vec128 operator<<(const Vec128 v, + const Vec128 bits) { + return Vec128(vshl_s8(v.raw, bits.raw)); +} + +HWY_API Vec128 operator<<(const Vec128 v, + const Vec128 bits) { + return Vec128(vshlq_s16(v.raw, bits.raw)); +} +template +HWY_API Vec128 operator<<(const Vec128 v, + const Vec128 bits) { + return Vec128(vshl_s16(v.raw, bits.raw)); +} + +HWY_API Vec128 operator<<(const Vec128 v, + const Vec128 bits) { + return Vec128(vshlq_s32(v.raw, bits.raw)); +} +template +HWY_API Vec128 operator<<(const Vec128 v, + const Vec128 bits) { + return Vec128(vshl_s32(v.raw, bits.raw)); +} + +HWY_API Vec128 operator<<(const Vec128 v, + const Vec128 bits) { + return Vec128(vshlq_s64(v.raw, bits.raw)); +} +HWY_API Vec64 operator<<(const Vec64 v, + const Vec64 bits) { + return Vec64(vshl_s64(v.raw, bits.raw)); +} + +// ------------------------------ Shr (Neg) + +HWY_API Vec128 operator>>(const Vec128 v, + const Vec128 bits) { + const int8x16_t neg_bits = Neg(BitCast(Full128(), bits)).raw; + return Vec128(vshlq_u8(v.raw, neg_bits)); +} +template +HWY_API Vec128 operator>>(const Vec128 v, + const Vec128 bits) { + const int8x8_t neg_bits = Neg(BitCast(Simd(), bits)).raw; + return Vec128(vshl_u8(v.raw, neg_bits)); +} + +HWY_API Vec128 operator>>(const Vec128 v, + const Vec128 bits) { + const int16x8_t neg_bits = Neg(BitCast(Full128(), bits)).raw; + return Vec128(vshlq_u16(v.raw, neg_bits)); +} +template +HWY_API Vec128 operator>>(const Vec128 v, + const Vec128 bits) { + const int16x4_t neg_bits = Neg(BitCast(Simd(), bits)).raw; + return Vec128(vshl_u16(v.raw, neg_bits)); +} + +HWY_API Vec128 operator>>(const Vec128 v, + const Vec128 bits) { + const int32x4_t neg_bits = Neg(BitCast(Full128(), bits)).raw; + return Vec128(vshlq_u32(v.raw, neg_bits)); +} +template +HWY_API Vec128 operator>>(const Vec128 v, + const Vec128 bits) { + const int32x2_t neg_bits = Neg(BitCast(Simd(), bits)).raw; + return Vec128(vshl_u32(v.raw, neg_bits)); +} + +HWY_API Vec128 operator>>(const Vec128 v, + const Vec128 bits) { + const int64x2_t neg_bits = Neg(BitCast(Full128(), bits)).raw; + return Vec128(vshlq_u64(v.raw, neg_bits)); +} +HWY_API Vec64 operator>>(const Vec64 v, + const Vec64 bits) { + const int64x1_t neg_bits = Neg(BitCast(Full64(), bits)).raw; + return 
Vec64(vshl_u64(v.raw, neg_bits)); +} + +HWY_API Vec128 operator>>(const Vec128 v, + const Vec128 bits) { + return Vec128(vshlq_s8(v.raw, Neg(bits).raw)); +} +template +HWY_API Vec128 operator>>(const Vec128 v, + const Vec128 bits) { + return Vec128(vshl_s8(v.raw, Neg(bits).raw)); +} + +HWY_API Vec128 operator>>(const Vec128 v, + const Vec128 bits) { + return Vec128(vshlq_s16(v.raw, Neg(bits).raw)); +} +template +HWY_API Vec128 operator>>(const Vec128 v, + const Vec128 bits) { + return Vec128(vshl_s16(v.raw, Neg(bits).raw)); +} + +HWY_API Vec128 operator>>(const Vec128 v, + const Vec128 bits) { + return Vec128(vshlq_s32(v.raw, Neg(bits).raw)); +} +template +HWY_API Vec128 operator>>(const Vec128 v, + const Vec128 bits) { + return Vec128(vshl_s32(v.raw, Neg(bits).raw)); +} + +HWY_API Vec128 operator>>(const Vec128 v, + const Vec128 bits) { + return Vec128(vshlq_s64(v.raw, Neg(bits).raw)); +} +HWY_API Vec64 operator>>(const Vec64 v, + const Vec64 bits) { + return Vec64(vshl_s64(v.raw, Neg(bits).raw)); +} + +// ------------------------------ ShiftLeftSame (Shl) + +template +HWY_API Vec128 ShiftLeftSame(const Vec128 v, int bits) { + return v << Set(Simd(), static_cast(bits)); +} +template +HWY_API Vec128 ShiftRightSame(const Vec128 v, int bits) { + return v >> Set(Simd(), static_cast(bits)); +} + +// ------------------------------ Integer multiplication + +// Unsigned +HWY_API Vec128 operator*(const Vec128 a, + const Vec128 b) { + return Vec128(vmulq_u16(a.raw, b.raw)); +} +HWY_API Vec128 operator*(const Vec128 a, + const Vec128 b) { + return Vec128(vmulq_u32(a.raw, b.raw)); +} + +template +HWY_API Vec128 operator*(const Vec128 a, + const Vec128 b) { + return Vec128(vmul_u16(a.raw, b.raw)); +} +template +HWY_API Vec128 operator*(const Vec128 a, + const Vec128 b) { + return Vec128(vmul_u32(a.raw, b.raw)); +} + +// Signed +HWY_API Vec128 operator*(const Vec128 a, + const Vec128 b) { + return Vec128(vmulq_s16(a.raw, b.raw)); +} +HWY_API Vec128 operator*(const Vec128 a, + const Vec128 b) { + return Vec128(vmulq_s32(a.raw, b.raw)); +} + +template +HWY_API Vec128 operator*(const Vec128 a, + const Vec128 b) { + return Vec128(vmul_s16(a.raw, b.raw)); +} +template +HWY_API Vec128 operator*(const Vec128 a, + const Vec128 b) { + return Vec128(vmul_s32(a.raw, b.raw)); +} + +// Returns the upper 16 bits of a * b in each lane. 
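+// E.g. MulHigh(Set(d, int16_t{16384}), Set(d, int16_t{256})) yields 64 per
+// lane: the full product is 16384 * 256 = 0x00400000, whose upper 16 bits
+// are 0x0040 (illustrative values; d is an assumed Full128<int16_t> tag).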
+HWY_API Vec128 MulHigh(const Vec128 a, + const Vec128 b) { + int32x4_t rlo = vmull_s16(vget_low_s16(a.raw), vget_low_s16(b.raw)); +#if HWY_ARCH_ARM_A64 + int32x4_t rhi = vmull_high_s16(a.raw, b.raw); +#else + int32x4_t rhi = vmull_s16(vget_high_s16(a.raw), vget_high_s16(b.raw)); +#endif + return Vec128( + vuzp2q_s16(vreinterpretq_s16_s32(rlo), vreinterpretq_s16_s32(rhi))); +} +HWY_API Vec128 MulHigh(const Vec128 a, + const Vec128 b) { + uint32x4_t rlo = vmull_u16(vget_low_u16(a.raw), vget_low_u16(b.raw)); +#if HWY_ARCH_ARM_A64 + uint32x4_t rhi = vmull_high_u16(a.raw, b.raw); +#else + uint32x4_t rhi = vmull_u16(vget_high_u16(a.raw), vget_high_u16(b.raw)); +#endif + return Vec128( + vuzp2q_u16(vreinterpretq_u16_u32(rlo), vreinterpretq_u16_u32(rhi))); +} + +template +HWY_API Vec128 MulHigh(const Vec128 a, + const Vec128 b) { + int16x8_t hi_lo = vreinterpretq_s16_s32(vmull_s16(a.raw, b.raw)); + return Vec128(vget_low_s16(vuzp2q_s16(hi_lo, hi_lo))); +} +template +HWY_API Vec128 MulHigh(const Vec128 a, + const Vec128 b) { + uint16x8_t hi_lo = vreinterpretq_u16_u32(vmull_u16(a.raw, b.raw)); + return Vec128(vget_low_u16(vuzp2q_u16(hi_lo, hi_lo))); +} + +HWY_API Vec128 MulFixedPoint15(Vec128 a, Vec128 b) { + return Vec128(vqrdmulhq_s16(a.raw, b.raw)); +} +template +HWY_API Vec128 MulFixedPoint15(Vec128 a, + Vec128 b) { + return Vec128(vqrdmulh_s16(a.raw, b.raw)); +} + +// ------------------------------ Floating-point mul / div + +HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator*, vmul, _, 2) + +// Approximate reciprocal +HWY_API Vec128 ApproximateReciprocal(const Vec128 v) { + return Vec128(vrecpeq_f32(v.raw)); +} +template +HWY_API Vec128 ApproximateReciprocal(const Vec128 v) { + return Vec128(vrecpe_f32(v.raw)); +} + +#if HWY_ARCH_ARM_A64 +HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator/, vdiv, _, 2) +#else +// Not defined on armv7: approximate +namespace detail { + +HWY_INLINE Vec128 ReciprocalNewtonRaphsonStep( + const Vec128 recip, const Vec128 divisor) { + return Vec128(vrecpsq_f32(recip.raw, divisor.raw)); +} +template +HWY_INLINE Vec128 ReciprocalNewtonRaphsonStep( + const Vec128 recip, Vec128 divisor) { + return Vec128(vrecps_f32(recip.raw, divisor.raw)); +} + +} // namespace detail + +template +HWY_API Vec128 operator/(const Vec128 a, + const Vec128 b) { + auto x = ApproximateReciprocal(b); + x *= detail::ReciprocalNewtonRaphsonStep(x, b); + x *= detail::ReciprocalNewtonRaphsonStep(x, b); + x *= detail::ReciprocalNewtonRaphsonStep(x, b); + return a * x; +} +#endif + +// ------------------------------ Absolute value of difference. + +HWY_API Vec128 AbsDiff(const Vec128 a, const Vec128 b) { + return Vec128(vabdq_f32(a.raw, b.raw)); +} +template +HWY_API Vec128 AbsDiff(const Vec128 a, + const Vec128 b) { + return Vec128(vabd_f32(a.raw, b.raw)); +} + +// ------------------------------ Floating-point multiply-add variants + +// Returns add + mul * x +#if defined(__ARM_VFPV4__) || HWY_ARCH_ARM_A64 +template +HWY_API Vec128 MulAdd(const Vec128 mul, + const Vec128 x, + const Vec128 add) { + return Vec128(vfma_f32(add.raw, mul.raw, x.raw)); +} +HWY_API Vec128 MulAdd(const Vec128 mul, const Vec128 x, + const Vec128 add) { + return Vec128(vfmaq_f32(add.raw, mul.raw, x.raw)); +} +#else +// Emulate FMA for floats. 
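+// Without VFPv4/A64 FMA, the multiply and add below round separately, so
+// results may differ from a fused multiply-add in the last ulp. The contract
+// is otherwise unchanged, e.g. MulAdd(Set(d, 2.0f), Set(d, 3.0f),
+// Set(d, 1.0f)) still equals Set(d, 7.0f) (illustrative only; d is an
+// assumed float tag).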
+template +HWY_API Vec128 MulAdd(const Vec128 mul, + const Vec128 x, + const Vec128 add) { + return mul * x + add; +} +#endif + +#if HWY_ARCH_ARM_A64 +HWY_API Vec64 MulAdd(const Vec64 mul, const Vec64 x, + const Vec64 add) { + return Vec64(vfma_f64(add.raw, mul.raw, x.raw)); +} +HWY_API Vec128 MulAdd(const Vec128 mul, const Vec128 x, + const Vec128 add) { + return Vec128(vfmaq_f64(add.raw, mul.raw, x.raw)); +} +#endif + +// Returns add - mul * x +#if defined(__ARM_VFPV4__) || HWY_ARCH_ARM_A64 +template +HWY_API Vec128 NegMulAdd(const Vec128 mul, + const Vec128 x, + const Vec128 add) { + return Vec128(vfms_f32(add.raw, mul.raw, x.raw)); +} +HWY_API Vec128 NegMulAdd(const Vec128 mul, const Vec128 x, + const Vec128 add) { + return Vec128(vfmsq_f32(add.raw, mul.raw, x.raw)); +} +#else +// Emulate FMA for floats. +template +HWY_API Vec128 NegMulAdd(const Vec128 mul, + const Vec128 x, + const Vec128 add) { + return add - mul * x; +} +#endif + +#if HWY_ARCH_ARM_A64 +HWY_API Vec64 NegMulAdd(const Vec64 mul, const Vec64 x, + const Vec64 add) { + return Vec64(vfms_f64(add.raw, mul.raw, x.raw)); +} +HWY_API Vec128 NegMulAdd(const Vec128 mul, + const Vec128 x, + const Vec128 add) { + return Vec128(vfmsq_f64(add.raw, mul.raw, x.raw)); +} +#endif + +// Returns mul * x - sub +template +HWY_API Vec128 MulSub(const Vec128 mul, + const Vec128 x, + const Vec128 sub) { + return MulAdd(mul, x, Neg(sub)); +} + +// Returns -mul * x - sub +template +HWY_API Vec128 NegMulSub(const Vec128 mul, + const Vec128 x, + const Vec128 sub) { + return Neg(MulAdd(mul, x, sub)); +} + +#if HWY_ARCH_ARM_A64 +template +HWY_API Vec128 MulSub(const Vec128 mul, + const Vec128 x, + const Vec128 sub) { + return MulAdd(mul, x, Neg(sub)); +} +template +HWY_API Vec128 NegMulSub(const Vec128 mul, + const Vec128 x, + const Vec128 sub) { + return Neg(MulAdd(mul, x, sub)); +} +#endif + +// ------------------------------ Floating-point square root (IfThenZeroElse) + +// Approximate reciprocal square root +HWY_API Vec128 ApproximateReciprocalSqrt(const Vec128 v) { + return Vec128(vrsqrteq_f32(v.raw)); +} +template +HWY_API Vec128 ApproximateReciprocalSqrt(const Vec128 v) { + return Vec128(vrsqrte_f32(v.raw)); +} + +// Full precision square root +#if HWY_ARCH_ARM_A64 +HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Sqrt, vsqrt, _, 1) +#else +namespace detail { + +HWY_INLINE Vec128 ReciprocalSqrtStep(const Vec128 root, + const Vec128 recip) { + return Vec128(vrsqrtsq_f32(root.raw, recip.raw)); +} +template +HWY_INLINE Vec128 ReciprocalSqrtStep(const Vec128 root, + Vec128 recip) { + return Vec128(vrsqrts_f32(root.raw, recip.raw)); +} + +} // namespace detail + +// Not defined on armv7: approximate +template +HWY_API Vec128 Sqrt(const Vec128 v) { + auto recip = ApproximateReciprocalSqrt(v); + + recip *= detail::ReciprocalSqrtStep(v * recip, recip); + recip *= detail::ReciprocalSqrtStep(v * recip, recip); + recip *= detail::ReciprocalSqrtStep(v * recip, recip); + + const auto root = v * recip; + return IfThenZeroElse(v == Zero(Simd()), root); +} +#endif + +// ================================================== LOGICAL + +// ------------------------------ Not + +// There is no 64-bit vmvn, so cast instead of using HWY_NEON_DEF_FUNCTION. 
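+// The complement is bitwise and thus lane-size agnostic, so performing it on
+// u8 lanes and casting back is free. For an integer lane type T this is
+// equivalent to Xor(v, Set(d, static_cast<T>(~T{0}))) (illustrative
+// equivalence, not the code path taken here).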
+template +HWY_API Vec128 Not(const Vec128 v) { + const Full128 d; + const Repartition d8; + return BitCast(d, Vec128(vmvnq_u8(BitCast(d8, v).raw))); +} +template +HWY_API Vec128 Not(const Vec128 v) { + const Simd d; + const Repartition d8; + using V8 = decltype(Zero(d8)); + return BitCast(d, V8(vmvn_u8(BitCast(d8, v).raw))); +} + +// ------------------------------ And +HWY_NEON_DEF_FUNCTION_INTS_UINTS(And, vand, _, 2) + +// Uses the u32/64 defined above. +template +HWY_API Vec128 And(const Vec128 a, const Vec128 b) { + const DFromV d; + const RebindToUnsigned du; + return BitCast(d, BitCast(du, a) & BitCast(du, b)); +} + +// ------------------------------ AndNot + +namespace detail { +// reversed_andnot returns a & ~b. +HWY_NEON_DEF_FUNCTION_INTS_UINTS(reversed_andnot, vbic, _, 2) +} // namespace detail + +// Returns ~not_mask & mask. +template +HWY_API Vec128 AndNot(const Vec128 not_mask, + const Vec128 mask) { + return detail::reversed_andnot(mask, not_mask); +} + +// Uses the u32/64 defined above. +template +HWY_API Vec128 AndNot(const Vec128 not_mask, + const Vec128 mask) { + const DFromV d; + const RebindToUnsigned du; + VFromD ret = + detail::reversed_andnot(BitCast(du, mask), BitCast(du, not_mask)); + return BitCast(d, ret); +} + +// ------------------------------ Or + +HWY_NEON_DEF_FUNCTION_INTS_UINTS(Or, vorr, _, 2) + +// Uses the u32/64 defined above. +template +HWY_API Vec128 Or(const Vec128 a, const Vec128 b) { + const DFromV d; + const RebindToUnsigned du; + return BitCast(d, BitCast(du, a) | BitCast(du, b)); +} + +// ------------------------------ Xor + +HWY_NEON_DEF_FUNCTION_INTS_UINTS(Xor, veor, _, 2) + +// Uses the u32/64 defined above. +template +HWY_API Vec128 Xor(const Vec128 a, const Vec128 b) { + const DFromV d; + const RebindToUnsigned du; + return BitCast(d, BitCast(du, a) ^ BitCast(du, b)); +} + +// ------------------------------ Or3 + +template +HWY_API Vec128 Or3(Vec128 o1, Vec128 o2, Vec128 o3) { + return Or(o1, Or(o2, o3)); +} + +// ------------------------------ OrAnd + +template +HWY_API Vec128 OrAnd(Vec128 o, Vec128 a1, Vec128 a2) { + return Or(o, And(a1, a2)); +} + +// ------------------------------ IfVecThenElse + +template +HWY_API Vec128 IfVecThenElse(Vec128 mask, Vec128 yes, + Vec128 no) { + return IfThenElse(MaskFromVec(mask), yes, no); +} + +// ------------------------------ Operator overloads (internal-only if float) + +template +HWY_API Vec128 operator&(const Vec128 a, const Vec128 b) { + return And(a, b); +} + +template +HWY_API Vec128 operator|(const Vec128 a, const Vec128 b) { + return Or(a, b); +} + +template +HWY_API Vec128 operator^(const Vec128 a, const Vec128 b) { + return Xor(a, b); +} + +// ------------------------------ PopulationCount + +#ifdef HWY_NATIVE_POPCNT +#undef HWY_NATIVE_POPCNT +#else +#define HWY_NATIVE_POPCNT +#endif + +namespace detail { + +template +HWY_INLINE Vec128 PopulationCount(hwy::SizeTag<1> /* tag */, Vec128 v) { + const Full128 d8; + return Vec128(vcntq_u8(BitCast(d8, v).raw)); +} +template +HWY_INLINE Vec128 PopulationCount(hwy::SizeTag<1> /* tag */, + Vec128 v) { + const Simd d8; + return Vec128(vcnt_u8(BitCast(d8, v).raw)); +} + +// ARM lacks popcount for lane sizes > 1, so take pairwise sums of the bytes. 
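+// E.g. a u16 lane holding 0x0F0F: vcnt yields per-byte counts {4, 4}, then
+// vpaddl widens and sums adjacent byte pairs to the expected 8 (worked
+// example, illustrative values only).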
+template +HWY_INLINE Vec128 PopulationCount(hwy::SizeTag<2> /* tag */, Vec128 v) { + const Full128 d8; + const uint8x16_t bytes = vcntq_u8(BitCast(d8, v).raw); + return Vec128(vpaddlq_u8(bytes)); +} +template +HWY_INLINE Vec128 PopulationCount(hwy::SizeTag<2> /* tag */, + Vec128 v) { + const Repartition> d8; + const uint8x8_t bytes = vcnt_u8(BitCast(d8, v).raw); + return Vec128(vpaddl_u8(bytes)); +} + +template +HWY_INLINE Vec128 PopulationCount(hwy::SizeTag<4> /* tag */, Vec128 v) { + const Full128 d8; + const uint8x16_t bytes = vcntq_u8(BitCast(d8, v).raw); + return Vec128(vpaddlq_u16(vpaddlq_u8(bytes))); +} +template +HWY_INLINE Vec128 PopulationCount(hwy::SizeTag<4> /* tag */, + Vec128 v) { + const Repartition> d8; + const uint8x8_t bytes = vcnt_u8(BitCast(d8, v).raw); + return Vec128(vpaddl_u16(vpaddl_u8(bytes))); +} + +template +HWY_INLINE Vec128 PopulationCount(hwy::SizeTag<8> /* tag */, Vec128 v) { + const Full128 d8; + const uint8x16_t bytes = vcntq_u8(BitCast(d8, v).raw); + return Vec128(vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(bytes)))); +} +template +HWY_INLINE Vec128 PopulationCount(hwy::SizeTag<8> /* tag */, + Vec128 v) { + const Repartition> d8; + const uint8x8_t bytes = vcnt_u8(BitCast(d8, v).raw); + return Vec128(vpaddl_u32(vpaddl_u16(vpaddl_u8(bytes)))); +} + +} // namespace detail + +template +HWY_API Vec128 PopulationCount(Vec128 v) { + return detail::PopulationCount(hwy::SizeTag(), v); +} + +// ================================================== SIGN + +// ------------------------------ Abs + +// Returns absolute value, except that LimitsMin() maps to LimitsMax() + 1. +HWY_API Vec128 Abs(const Vec128 v) { + return Vec128(vabsq_s8(v.raw)); +} +HWY_API Vec128 Abs(const Vec128 v) { + return Vec128(vabsq_s16(v.raw)); +} +HWY_API Vec128 Abs(const Vec128 v) { + return Vec128(vabsq_s32(v.raw)); +} +// i64 is implemented after BroadcastSignBit. +HWY_API Vec128 Abs(const Vec128 v) { + return Vec128(vabsq_f32(v.raw)); +} + +template +HWY_API Vec128 Abs(const Vec128 v) { + return Vec128(vabs_s8(v.raw)); +} +template +HWY_API Vec128 Abs(const Vec128 v) { + return Vec128(vabs_s16(v.raw)); +} +template +HWY_API Vec128 Abs(const Vec128 v) { + return Vec128(vabs_s32(v.raw)); +} +template +HWY_API Vec128 Abs(const Vec128 v) { + return Vec128(vabs_f32(v.raw)); +} + +#if HWY_ARCH_ARM_A64 +HWY_API Vec128 Abs(const Vec128 v) { + return Vec128(vabsq_f64(v.raw)); +} + +HWY_API Vec64 Abs(const Vec64 v) { + return Vec64(vabs_f64(v.raw)); +} +#endif + +// ------------------------------ CopySign + +template +HWY_API Vec128 CopySign(const Vec128 magn, + const Vec128 sign) { + static_assert(IsFloat(), "Only makes sense for floating-point"); + const auto msb = SignBit(Simd()); + return Or(AndNot(msb, magn), And(msb, sign)); +} + +template +HWY_API Vec128 CopySignToAbs(const Vec128 abs, + const Vec128 sign) { + static_assert(IsFloat(), "Only makes sense for floating-point"); + return Or(abs, And(SignBit(Simd()), sign)); +} + +// ------------------------------ BroadcastSignBit + +template +HWY_API Vec128 BroadcastSignBit(const Vec128 v) { + return ShiftRight(v); +} + +// ================================================== MASK + +// ------------------------------ To/from vector + +// Mask and Vec have the same representation (true = FF..FF). 
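+// E.g. for integer v, VecFromMask(d, Eq(v, v)) is all-ones in every lane;
+// this FF..FF representation is what lets IfThenElse below compile to a
+// single vbsl bit-select (illustrative note).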
+template +HWY_API Mask128 MaskFromVec(const Vec128 v) { + const Simd, N, 0> du; + return Mask128(BitCast(du, v).raw); +} + +template +HWY_API Vec128 VecFromMask(Simd d, const Mask128 v) { + return BitCast(d, Vec128, N>(v.raw)); +} + +// ------------------------------ RebindMask + +template +HWY_API Mask128 RebindMask(Simd dto, Mask128 m) { + static_assert(sizeof(TFrom) == sizeof(TTo), "Must have same size"); + return MaskFromVec(BitCast(dto, VecFromMask(Simd(), m))); +} + +// ------------------------------ IfThenElse(mask, yes, no) = mask ? b : a. + +#define HWY_NEON_BUILD_TPL_HWY_IF +#define HWY_NEON_BUILD_RET_HWY_IF(type, size) Vec128 +#define HWY_NEON_BUILD_PARAM_HWY_IF(type, size) \ + const Mask128 mask, const Vec128 yes, \ + const Vec128 no +#define HWY_NEON_BUILD_ARG_HWY_IF mask.raw, yes.raw, no.raw + +HWY_NEON_DEF_FUNCTION_ALL_TYPES(IfThenElse, vbsl, _, HWY_IF) + +#undef HWY_NEON_BUILD_TPL_HWY_IF +#undef HWY_NEON_BUILD_RET_HWY_IF +#undef HWY_NEON_BUILD_PARAM_HWY_IF +#undef HWY_NEON_BUILD_ARG_HWY_IF + +// mask ? yes : 0 +template +HWY_API Vec128 IfThenElseZero(const Mask128 mask, + const Vec128 yes) { + return yes & VecFromMask(Simd(), mask); +} + +// mask ? 0 : no +template +HWY_API Vec128 IfThenZeroElse(const Mask128 mask, + const Vec128 no) { + return AndNot(VecFromMask(Simd(), mask), no); +} + +template +HWY_API Vec128 IfNegativeThenElse(Vec128 v, Vec128 yes, + Vec128 no) { + static_assert(IsSigned(), "Only works for signed/float"); + const Simd d; + const RebindToSigned di; + + Mask128 m = MaskFromVec(BitCast(d, BroadcastSignBit(BitCast(di, v)))); + return IfThenElse(m, yes, no); +} + +template +HWY_API Vec128 ZeroIfNegative(Vec128 v) { + const auto zero = Zero(Simd()); + return Max(zero, v); +} + +// ------------------------------ Mask logical + +template +HWY_API Mask128 Not(const Mask128 m) { + return MaskFromVec(Not(VecFromMask(Simd(), m))); +} + +template +HWY_API Mask128 And(const Mask128 a, Mask128 b) { + const Simd d; + return MaskFromVec(And(VecFromMask(d, a), VecFromMask(d, b))); +} + +template +HWY_API Mask128 AndNot(const Mask128 a, Mask128 b) { + const Simd d; + return MaskFromVec(AndNot(VecFromMask(d, a), VecFromMask(d, b))); +} + +template +HWY_API Mask128 Or(const Mask128 a, Mask128 b) { + const Simd d; + return MaskFromVec(Or(VecFromMask(d, a), VecFromMask(d, b))); +} + +template +HWY_API Mask128 Xor(const Mask128 a, Mask128 b) { + const Simd d; + return MaskFromVec(Xor(VecFromMask(d, a), VecFromMask(d, b))); +} + +template +HWY_API Mask128 ExclusiveNeither(const Mask128 a, Mask128 b) { + const Simd d; + return MaskFromVec(AndNot(VecFromMask(d, a), Not(VecFromMask(d, b)))); +} + +// ================================================== COMPARE + +// Comparisons fill a lane with 1-bits if the condition is true, else 0. 
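+// This makes masks directly usable for blending, e.g. (illustrative sketch;
+// d is an assumed Full128<int32_t> tag with vectors a, b):
+//   const auto m = a < b;                      // FF..FF where a < b
+//   const auto lesser = IfThenElse(m, a, b);   // equivalent to Min(a, b)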
+ +// ------------------------------ Shuffle2301 (for i64 compares) + +// Swap 32-bit halves in 64-bits +HWY_API Vec64 Shuffle2301(const Vec64 v) { + return Vec64(vrev64_u32(v.raw)); +} +HWY_API Vec64 Shuffle2301(const Vec64 v) { + return Vec64(vrev64_s32(v.raw)); +} +HWY_API Vec64 Shuffle2301(const Vec64 v) { + return Vec64(vrev64_f32(v.raw)); +} +HWY_API Vec128 Shuffle2301(const Vec128 v) { + return Vec128(vrev64q_u32(v.raw)); +} +HWY_API Vec128 Shuffle2301(const Vec128 v) { + return Vec128(vrev64q_s32(v.raw)); +} +HWY_API Vec128 Shuffle2301(const Vec128 v) { + return Vec128(vrev64q_f32(v.raw)); +} + +#define HWY_NEON_BUILD_TPL_HWY_COMPARE +#define HWY_NEON_BUILD_RET_HWY_COMPARE(type, size) Mask128 +#define HWY_NEON_BUILD_PARAM_HWY_COMPARE(type, size) \ + const Vec128 a, const Vec128 b +#define HWY_NEON_BUILD_ARG_HWY_COMPARE a.raw, b.raw + +// ------------------------------ Equality +HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator==, vceq, _, HWY_COMPARE) +#if HWY_ARCH_ARM_A64 +HWY_NEON_DEF_FUNCTION_INTS_UINTS(operator==, vceq, _, HWY_COMPARE) +#else +// No 64-bit comparisons on armv7: emulate them below, after Shuffle2301. +HWY_NEON_DEF_FUNCTION_INT_8_16_32(operator==, vceq, _, HWY_COMPARE) +HWY_NEON_DEF_FUNCTION_UINT_8_16_32(operator==, vceq, _, HWY_COMPARE) +#endif + +// ------------------------------ Strict inequality (signed, float) +#if HWY_ARCH_ARM_A64 +HWY_NEON_DEF_FUNCTION_INTS_UINTS(operator<, vclt, _, HWY_COMPARE) +#else +HWY_NEON_DEF_FUNCTION_UINT_8_16_32(operator<, vclt, _, HWY_COMPARE) +HWY_NEON_DEF_FUNCTION_INT_8_16_32(operator<, vclt, _, HWY_COMPARE) +#endif +HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator<, vclt, _, HWY_COMPARE) + +// ------------------------------ Weak inequality (float) +HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator<=, vcle, _, HWY_COMPARE) + +#undef HWY_NEON_BUILD_TPL_HWY_COMPARE +#undef HWY_NEON_BUILD_RET_HWY_COMPARE +#undef HWY_NEON_BUILD_PARAM_HWY_COMPARE +#undef HWY_NEON_BUILD_ARG_HWY_COMPARE + +// ------------------------------ ARMv7 i64 compare (Shuffle2301, Eq) + +#if HWY_ARCH_ARM_V7 + +template +HWY_API Mask128 operator==(const Vec128 a, + const Vec128 b) { + const Simd d32; + const Simd d64; + const auto cmp32 = VecFromMask(d32, Eq(BitCast(d32, a), BitCast(d32, b))); + const auto cmp64 = cmp32 & Shuffle2301(cmp32); + return MaskFromVec(BitCast(d64, cmp64)); +} + +template +HWY_API Mask128 operator==(const Vec128 a, + const Vec128 b) { + const Simd d32; + const Simd d64; + const auto cmp32 = VecFromMask(d32, Eq(BitCast(d32, a), BitCast(d32, b))); + const auto cmp64 = cmp32 & Shuffle2301(cmp32); + return MaskFromVec(BitCast(d64, cmp64)); +} + +HWY_API Mask128 operator<(const Vec128 a, + const Vec128 b) { + const int64x2_t sub = vqsubq_s64(a.raw, b.raw); + return MaskFromVec(BroadcastSignBit(Vec128(sub))); +} +HWY_API Mask128 operator<(const Vec64 a, + const Vec64 b) { + const int64x1_t sub = vqsub_s64(a.raw, b.raw); + return MaskFromVec(BroadcastSignBit(Vec64(sub))); +} + +template +HWY_API Mask128 operator<(const Vec128 a, + const Vec128 b) { + const DFromV du; + const RebindToSigned di; + const Vec128 msb = AndNot(a, b) | AndNot(a ^ b, a - b); + return MaskFromVec(BitCast(du, BroadcastSignBit(BitCast(di, msb)))); +} + +#endif + +// ------------------------------ operator!= (operator==) + +// Customize HWY_NEON_DEF_FUNCTION to call 2 functions. 
+#pragma push_macro("HWY_NEON_DEF_FUNCTION") +#undef HWY_NEON_DEF_FUNCTION +// This cannot have _any_ template argument (in x86_128 we can at least have N +// as an argument), otherwise it is not more specialized than rewritten +// operator== in C++20, leading to compile errors. +#define HWY_NEON_DEF_FUNCTION(type, size, name, prefix, infix, suffix, args) \ + HWY_API Mask128 name(Vec128 a, \ + Vec128 b) { \ + return Not(a == b); \ + } + +HWY_NEON_DEF_FUNCTION_ALL_TYPES(operator!=, ignored, ignored, ignored) + +#pragma pop_macro("HWY_NEON_DEF_FUNCTION") + +// ------------------------------ Reversed comparisons + +template +HWY_API Mask128 operator>(Vec128 a, Vec128 b) { + return operator<(b, a); +} +template +HWY_API Mask128 operator>=(Vec128 a, Vec128 b) { + return operator<=(b, a); +} + +// ------------------------------ FirstN (Iota, Lt) + +template +HWY_API Mask128 FirstN(const Simd d, size_t num) { + const RebindToSigned di; // Signed comparisons are cheaper. + return RebindMask(d, Iota(di, 0) < Set(di, static_cast>(num))); +} + +// ------------------------------ TestBit (Eq) + +#define HWY_NEON_BUILD_TPL_HWY_TESTBIT +#define HWY_NEON_BUILD_RET_HWY_TESTBIT(type, size) Mask128 +#define HWY_NEON_BUILD_PARAM_HWY_TESTBIT(type, size) \ + Vec128 v, Vec128 bit +#define HWY_NEON_BUILD_ARG_HWY_TESTBIT v.raw, bit.raw + +#if HWY_ARCH_ARM_A64 +HWY_NEON_DEF_FUNCTION_INTS_UINTS(TestBit, vtst, _, HWY_TESTBIT) +#else +// No 64-bit versions on armv7 +HWY_NEON_DEF_FUNCTION_UINT_8_16_32(TestBit, vtst, _, HWY_TESTBIT) +HWY_NEON_DEF_FUNCTION_INT_8_16_32(TestBit, vtst, _, HWY_TESTBIT) + +template +HWY_API Mask128 TestBit(Vec128 v, + Vec128 bit) { + return (v & bit) == bit; +} +template +HWY_API Mask128 TestBit(Vec128 v, + Vec128 bit) { + return (v & bit) == bit; +} + +#endif +#undef HWY_NEON_BUILD_TPL_HWY_TESTBIT +#undef HWY_NEON_BUILD_RET_HWY_TESTBIT +#undef HWY_NEON_BUILD_PARAM_HWY_TESTBIT +#undef HWY_NEON_BUILD_ARG_HWY_TESTBIT + +// ------------------------------ Abs i64 (IfThenElse, BroadcastSignBit) +HWY_API Vec128 Abs(const Vec128 v) { +#if HWY_ARCH_ARM_A64 + return Vec128(vabsq_s64(v.raw)); +#else + const auto zero = Zero(Full128()); + return IfThenElse(MaskFromVec(BroadcastSignBit(v)), zero - v, v); +#endif +} +HWY_API Vec64 Abs(const Vec64 v) { +#if HWY_ARCH_ARM_A64 + return Vec64(vabs_s64(v.raw)); +#else + const auto zero = Zero(Full64()); + return IfThenElse(MaskFromVec(BroadcastSignBit(v)), zero - v, v); +#endif +} + +// ------------------------------ Min (IfThenElse, BroadcastSignBit) + +// Unsigned +HWY_NEON_DEF_FUNCTION_UINT_8_16_32(Min, vmin, _, 2) + +template +HWY_API Vec128 Min(const Vec128 a, + const Vec128 b) { +#if HWY_ARCH_ARM_A64 + return IfThenElse(b < a, b, a); +#else + const DFromV du; + const RebindToSigned di; + return BitCast(du, BitCast(di, a) - BitCast(di, detail::SaturatedSub(a, b))); +#endif +} + +// Signed +HWY_NEON_DEF_FUNCTION_INT_8_16_32(Min, vmin, _, 2) + +template +HWY_API Vec128 Min(const Vec128 a, + const Vec128 b) { +#if HWY_ARCH_ARM_A64 + return IfThenElse(b < a, b, a); +#else + const Vec128 sign = detail::SaturatedSub(a, b); + return IfThenElse(MaskFromVec(BroadcastSignBit(sign)), a, b); +#endif +} + +// Float: IEEE minimumNumber on v8, otherwise NaN if any is NaN. 
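+// I.e. on A64, Min(Set(d, NaN), x) == x because vminnm implements IEEE
+// minimumNumber (a single quiet-NaN input is ignored), whereas the v7 vmin
+// fallback returns NaN whenever either input is NaN (illustrative note).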
+#if HWY_ARCH_ARM_A64 +HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Min, vminnm, _, 2) +#else +HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Min, vmin, _, 2) +#endif + +// ------------------------------ Max (IfThenElse, BroadcastSignBit) + +// Unsigned (no u64) +HWY_NEON_DEF_FUNCTION_UINT_8_16_32(Max, vmax, _, 2) + +template +HWY_API Vec128 Max(const Vec128 a, + const Vec128 b) { +#if HWY_ARCH_ARM_A64 + return IfThenElse(b < a, a, b); +#else + const DFromV du; + const RebindToSigned di; + return BitCast(du, BitCast(di, b) + BitCast(di, detail::SaturatedSub(a, b))); +#endif +} + +// Signed (no i64) +HWY_NEON_DEF_FUNCTION_INT_8_16_32(Max, vmax, _, 2) + +template +HWY_API Vec128 Max(const Vec128 a, + const Vec128 b) { +#if HWY_ARCH_ARM_A64 + return IfThenElse(b < a, a, b); +#else + const Vec128 sign = detail::SaturatedSub(a, b); + return IfThenElse(MaskFromVec(BroadcastSignBit(sign)), b, a); +#endif +} + +// Float: IEEE maximumNumber on v8, otherwise NaN if any is NaN. +#if HWY_ARCH_ARM_A64 +HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Max, vmaxnm, _, 2) +#else +HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Max, vmax, _, 2) +#endif + +// ================================================== MEMORY + +// ------------------------------ Load 128 + +HWY_API Vec128 LoadU(Full128 /* tag */, + const uint8_t* HWY_RESTRICT unaligned) { + return Vec128(vld1q_u8(unaligned)); +} +HWY_API Vec128 LoadU(Full128 /* tag */, + const uint16_t* HWY_RESTRICT unaligned) { + return Vec128(vld1q_u16(unaligned)); +} +HWY_API Vec128 LoadU(Full128 /* tag */, + const uint32_t* HWY_RESTRICT unaligned) { + return Vec128(vld1q_u32(unaligned)); +} +HWY_API Vec128 LoadU(Full128 /* tag */, + const uint64_t* HWY_RESTRICT unaligned) { + return Vec128(vld1q_u64(unaligned)); +} +HWY_API Vec128 LoadU(Full128 /* tag */, + const int8_t* HWY_RESTRICT unaligned) { + return Vec128(vld1q_s8(unaligned)); +} +HWY_API Vec128 LoadU(Full128 /* tag */, + const int16_t* HWY_RESTRICT unaligned) { + return Vec128(vld1q_s16(unaligned)); +} +HWY_API Vec128 LoadU(Full128 /* tag */, + const int32_t* HWY_RESTRICT unaligned) { + return Vec128(vld1q_s32(unaligned)); +} +HWY_API Vec128 LoadU(Full128 /* tag */, + const int64_t* HWY_RESTRICT unaligned) { + return Vec128(vld1q_s64(unaligned)); +} +HWY_API Vec128 LoadU(Full128 /* tag */, + const float* HWY_RESTRICT unaligned) { + return Vec128(vld1q_f32(unaligned)); +} +#if HWY_ARCH_ARM_A64 +HWY_API Vec128 LoadU(Full128 /* tag */, + const double* HWY_RESTRICT unaligned) { + return Vec128(vld1q_f64(unaligned)); +} +#endif + +// ------------------------------ Load 64 + +HWY_API Vec64 LoadU(Full64 /* tag */, + const uint8_t* HWY_RESTRICT p) { + return Vec64(vld1_u8(p)); +} +HWY_API Vec64 LoadU(Full64 /* tag */, + const uint16_t* HWY_RESTRICT p) { + return Vec64(vld1_u16(p)); +} +HWY_API Vec64 LoadU(Full64 /* tag */, + const uint32_t* HWY_RESTRICT p) { + return Vec64(vld1_u32(p)); +} +HWY_API Vec64 LoadU(Full64 /* tag */, + const uint64_t* HWY_RESTRICT p) { + return Vec64(vld1_u64(p)); +} +HWY_API Vec64 LoadU(Full64 /* tag */, + const int8_t* HWY_RESTRICT p) { + return Vec64(vld1_s8(p)); +} +HWY_API Vec64 LoadU(Full64 /* tag */, + const int16_t* HWY_RESTRICT p) { + return Vec64(vld1_s16(p)); +} +HWY_API Vec64 LoadU(Full64 /* tag */, + const int32_t* HWY_RESTRICT p) { + return Vec64(vld1_s32(p)); +} +HWY_API Vec64 LoadU(Full64 /* tag */, + const int64_t* HWY_RESTRICT p) { + return Vec64(vld1_s64(p)); +} +HWY_API Vec64 LoadU(Full64 /* tag */, + const float* HWY_RESTRICT p) { + return Vec64(vld1_f32(p)); +} +#if HWY_ARCH_ARM_A64 +HWY_API Vec64 LoadU(Full64 /* tag 
*/, + const double* HWY_RESTRICT p) { + return Vec64(vld1_f64(p)); +} +#endif +// ------------------------------ Load 32 + +// Actual 32-bit broadcast load - used to implement the other lane types +// because reinterpret_cast of the pointer leads to incorrect codegen on GCC. +HWY_API Vec32 LoadU(Full32 /*tag*/, + const uint32_t* HWY_RESTRICT p) { + return Vec32(vld1_dup_u32(p)); +} +HWY_API Vec32 LoadU(Full32 /*tag*/, + const int32_t* HWY_RESTRICT p) { + return Vec32(vld1_dup_s32(p)); +} +HWY_API Vec32 LoadU(Full32 /*tag*/, const float* HWY_RESTRICT p) { + return Vec32(vld1_dup_f32(p)); +} + +template +HWY_API Vec32 LoadU(Full32 d, const T* HWY_RESTRICT p) { + const Repartition d32; + uint32_t buf; + CopyBytes<4>(p, &buf); + return BitCast(d, LoadU(d32, &buf)); +} + +// ------------------------------ Load 16 + +// Actual 16-bit broadcast load - used to implement the other lane types +// because reinterpret_cast of the pointer leads to incorrect codegen on GCC. +HWY_API Vec128 LoadU(Simd /*tag*/, + const uint16_t* HWY_RESTRICT p) { + return Vec128(vld1_dup_u16(p)); +} +HWY_API Vec128 LoadU(Simd /*tag*/, + const int16_t* HWY_RESTRICT p) { + return Vec128(vld1_dup_s16(p)); +} + +template +HWY_API Vec128 LoadU(Simd d, const T* HWY_RESTRICT p) { + const Repartition d16; + uint16_t buf; + CopyBytes<2>(p, &buf); + return BitCast(d, LoadU(d16, &buf)); +} + +// ------------------------------ Load 8 + +HWY_API Vec128 LoadU(Simd, + const uint8_t* HWY_RESTRICT p) { + return Vec128(vld1_dup_u8(p)); +} + +HWY_API Vec128 LoadU(Simd, + const int8_t* HWY_RESTRICT p) { + return Vec128(vld1_dup_s8(p)); +} + +// [b]float16_t use the same Raw as uint16_t, so forward to that. +template +HWY_API Vec128 LoadU(Simd d, + const float16_t* HWY_RESTRICT p) { + const RebindToUnsigned du16; + const auto pu16 = reinterpret_cast(p); + return Vec128(LoadU(du16, pu16).raw); +} +template +HWY_API Vec128 LoadU(Simd d, + const bfloat16_t* HWY_RESTRICT p) { + const RebindToUnsigned du16; + const auto pu16 = reinterpret_cast(p); + return Vec128(LoadU(du16, pu16).raw); +} + +// On ARM, Load is the same as LoadU. +template +HWY_API Vec128 Load(Simd d, const T* HWY_RESTRICT p) { + return LoadU(d, p); +} + +template +HWY_API Vec128 MaskedLoad(Mask128 m, Simd d, + const T* HWY_RESTRICT aligned) { + return IfThenElseZero(m, Load(d, aligned)); +} + +// 128-bit SIMD => nothing to duplicate, same as an unaligned load. 
+template +HWY_API Vec128 LoadDup128(Simd d, + const T* const HWY_RESTRICT p) { + return LoadU(d, p); +} + +// ------------------------------ Store 128 + +HWY_API void StoreU(const Vec128 v, Full128 /* tag */, + uint8_t* HWY_RESTRICT unaligned) { + vst1q_u8(unaligned, v.raw); +} +HWY_API void StoreU(const Vec128 v, Full128 /* tag */, + uint16_t* HWY_RESTRICT unaligned) { + vst1q_u16(unaligned, v.raw); +} +HWY_API void StoreU(const Vec128 v, Full128 /* tag */, + uint32_t* HWY_RESTRICT unaligned) { + vst1q_u32(unaligned, v.raw); +} +HWY_API void StoreU(const Vec128 v, Full128 /* tag */, + uint64_t* HWY_RESTRICT unaligned) { + vst1q_u64(unaligned, v.raw); +} +HWY_API void StoreU(const Vec128 v, Full128 /* tag */, + int8_t* HWY_RESTRICT unaligned) { + vst1q_s8(unaligned, v.raw); +} +HWY_API void StoreU(const Vec128 v, Full128 /* tag */, + int16_t* HWY_RESTRICT unaligned) { + vst1q_s16(unaligned, v.raw); +} +HWY_API void StoreU(const Vec128 v, Full128 /* tag */, + int32_t* HWY_RESTRICT unaligned) { + vst1q_s32(unaligned, v.raw); +} +HWY_API void StoreU(const Vec128 v, Full128 /* tag */, + int64_t* HWY_RESTRICT unaligned) { + vst1q_s64(unaligned, v.raw); +} +HWY_API void StoreU(const Vec128 v, Full128 /* tag */, + float* HWY_RESTRICT unaligned) { + vst1q_f32(unaligned, v.raw); +} +#if HWY_ARCH_ARM_A64 +HWY_API void StoreU(const Vec128 v, Full128 /* tag */, + double* HWY_RESTRICT unaligned) { + vst1q_f64(unaligned, v.raw); +} +#endif + +// ------------------------------ Store 64 + +HWY_API void StoreU(const Vec64 v, Full64 /* tag */, + uint8_t* HWY_RESTRICT p) { + vst1_u8(p, v.raw); +} +HWY_API void StoreU(const Vec64 v, Full64 /* tag */, + uint16_t* HWY_RESTRICT p) { + vst1_u16(p, v.raw); +} +HWY_API void StoreU(const Vec64 v, Full64 /* tag */, + uint32_t* HWY_RESTRICT p) { + vst1_u32(p, v.raw); +} +HWY_API void StoreU(const Vec64 v, Full64 /* tag */, + uint64_t* HWY_RESTRICT p) { + vst1_u64(p, v.raw); +} +HWY_API void StoreU(const Vec64 v, Full64 /* tag */, + int8_t* HWY_RESTRICT p) { + vst1_s8(p, v.raw); +} +HWY_API void StoreU(const Vec64 v, Full64 /* tag */, + int16_t* HWY_RESTRICT p) { + vst1_s16(p, v.raw); +} +HWY_API void StoreU(const Vec64 v, Full64 /* tag */, + int32_t* HWY_RESTRICT p) { + vst1_s32(p, v.raw); +} +HWY_API void StoreU(const Vec64 v, Full64 /* tag */, + int64_t* HWY_RESTRICT p) { + vst1_s64(p, v.raw); +} +HWY_API void StoreU(const Vec64 v, Full64 /* tag */, + float* HWY_RESTRICT p) { + vst1_f32(p, v.raw); +} +#if HWY_ARCH_ARM_A64 +HWY_API void StoreU(const Vec64 v, Full64 /* tag */, + double* HWY_RESTRICT p) { + vst1_f64(p, v.raw); +} +#endif + +// ------------------------------ Store 32 + +HWY_API void StoreU(const Vec32 v, Full32, + uint32_t* HWY_RESTRICT p) { + vst1_lane_u32(p, v.raw, 0); +} +HWY_API void StoreU(const Vec32 v, Full32, + int32_t* HWY_RESTRICT p) { + vst1_lane_s32(p, v.raw, 0); +} +HWY_API void StoreU(const Vec32 v, Full32, + float* HWY_RESTRICT p) { + vst1_lane_f32(p, v.raw, 0); +} + +template +HWY_API void StoreU(const Vec32 v, Full32 d, T* HWY_RESTRICT p) { + const Repartition d32; + const uint32_t buf = GetLane(BitCast(d32, v)); + CopyBytes<4>(&buf, p); +} + +// ------------------------------ Store 16 + +HWY_API void StoreU(const Vec128 v, Simd, + uint16_t* HWY_RESTRICT p) { + vst1_lane_u16(p, v.raw, 0); +} +HWY_API void StoreU(const Vec128 v, Simd, + int16_t* HWY_RESTRICT p) { + vst1_lane_s16(p, v.raw, 0); +} + +template +HWY_API void StoreU(const Vec128 v, Simd d, T* HWY_RESTRICT p) { + const Repartition d16; + const uint16_t buf = 
GetLane(BitCast(d16, v)); + CopyBytes<2>(&buf, p); +} + +// ------------------------------ Store 8 + +HWY_API void StoreU(const Vec128 v, Simd, + uint8_t* HWY_RESTRICT p) { + vst1_lane_u8(p, v.raw, 0); +} +HWY_API void StoreU(const Vec128 v, Simd, + int8_t* HWY_RESTRICT p) { + vst1_lane_s8(p, v.raw, 0); +} + +// [b]float16_t use the same Raw as uint16_t, so forward to that. +template +HWY_API void StoreU(Vec128 v, Simd d, + float16_t* HWY_RESTRICT p) { + const RebindToUnsigned du16; + const auto pu16 = reinterpret_cast(p); + return StoreU(Vec128(v.raw), du16, pu16); +} +template +HWY_API void StoreU(Vec128 v, Simd d, + bfloat16_t* HWY_RESTRICT p) { + const RebindToUnsigned du16; + const auto pu16 = reinterpret_cast(p); + return StoreU(Vec128(v.raw), du16, pu16); +} + +HWY_DIAGNOSTICS(push) +#if HWY_COMPILER_GCC_ACTUAL + HWY_DIAGNOSTICS_OFF(disable : 4701, ignored "-Wmaybe-uninitialized") +#endif + +// On ARM, Store is the same as StoreU. +template +HWY_API void Store(Vec128 v, Simd d, T* HWY_RESTRICT aligned) { + StoreU(v, d, aligned); +} + +HWY_DIAGNOSTICS(pop) + +template +HWY_API void BlendedStore(Vec128 v, Mask128 m, Simd d, + T* HWY_RESTRICT p) { + // Treat as unsigned so that we correctly support float16. + const RebindToUnsigned du; + const auto blended = + IfThenElse(RebindMask(du, m), BitCast(du, v), BitCast(du, LoadU(d, p))); + StoreU(BitCast(d, blended), d, p); +} + +// ------------------------------ Non-temporal stores + +// Same as aligned stores on non-x86. + +template +HWY_API void Stream(const Vec128 v, Simd d, + T* HWY_RESTRICT aligned) { + Store(v, d, aligned); +} + +// ================================================== CONVERT + +// ------------------------------ Promotions (part w/ narrow lanes -> full) + +// Unsigned: zero-extend to full vector. +HWY_API Vec128 PromoteTo(Full128 /* tag */, + const Vec64 v) { + return Vec128(vmovl_u8(v.raw)); +} +HWY_API Vec128 PromoteTo(Full128 /* tag */, + const Vec32 v) { + uint16x8_t a = vmovl_u8(v.raw); + return Vec128(vmovl_u16(vget_low_u16(a))); +} +HWY_API Vec128 PromoteTo(Full128 /* tag */, + const Vec64 v) { + return Vec128(vmovl_u16(v.raw)); +} +HWY_API Vec128 PromoteTo(Full128 /* tag */, + const Vec64 v) { + return Vec128(vmovl_u32(v.raw)); +} +HWY_API Vec128 PromoteTo(Full128 d, const Vec64 v) { + return BitCast(d, Vec128(vmovl_u8(v.raw))); +} +HWY_API Vec128 PromoteTo(Full128 d, const Vec32 v) { + uint16x8_t a = vmovl_u8(v.raw); + return BitCast(d, Vec128(vmovl_u16(vget_low_u16(a)))); +} +HWY_API Vec128 PromoteTo(Full128 d, const Vec64 v) { + return BitCast(d, Vec128(vmovl_u16(v.raw))); +} + +// Unsigned: zero-extend to half vector. 
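+// E.g. promoting u8 lanes {0xFF, 1, 2, 3} to u16 yields {0x00FF, 1, 2, 3}:
+// vmovl zero-extends the 64-bit half and vget_low keeps the valid lanes
+// (illustrative values).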
+template +HWY_API Vec128 PromoteTo(Simd /* tag */, + const Vec128 v) { + return Vec128(vget_low_u16(vmovl_u8(v.raw))); +} +template +HWY_API Vec128 PromoteTo(Simd /* tag */, + const Vec128 v) { + uint16x8_t a = vmovl_u8(v.raw); + return Vec128(vget_low_u32(vmovl_u16(vget_low_u16(a)))); +} +template +HWY_API Vec128 PromoteTo(Simd /* tag */, + const Vec128 v) { + return Vec128(vget_low_u32(vmovl_u16(v.raw))); +} +template +HWY_API Vec128 PromoteTo(Simd /* tag */, + const Vec128 v) { + return Vec128(vget_low_u64(vmovl_u32(v.raw))); +} +template +HWY_API Vec128 PromoteTo(Simd d, + const Vec128 v) { + return BitCast(d, Vec128(vget_low_u16(vmovl_u8(v.raw)))); +} +template +HWY_API Vec128 PromoteTo(Simd /* tag */, + const Vec128 v) { + uint16x8_t a = vmovl_u8(v.raw); + uint32x4_t b = vmovl_u16(vget_low_u16(a)); + return Vec128(vget_low_s32(vreinterpretq_s32_u32(b))); +} +template +HWY_API Vec128 PromoteTo(Simd /* tag */, + const Vec128 v) { + uint32x4_t a = vmovl_u16(v.raw); + return Vec128(vget_low_s32(vreinterpretq_s32_u32(a))); +} + +// Signed: replicate sign bit to full vector. +HWY_API Vec128 PromoteTo(Full128 /* tag */, + const Vec64 v) { + return Vec128(vmovl_s8(v.raw)); +} +HWY_API Vec128 PromoteTo(Full128 /* tag */, + const Vec32 v) { + int16x8_t a = vmovl_s8(v.raw); + return Vec128(vmovl_s16(vget_low_s16(a))); +} +HWY_API Vec128 PromoteTo(Full128 /* tag */, + const Vec64 v) { + return Vec128(vmovl_s16(v.raw)); +} +HWY_API Vec128 PromoteTo(Full128 /* tag */, + const Vec64 v) { + return Vec128(vmovl_s32(v.raw)); +} + +// Signed: replicate sign bit to half vector. +template +HWY_API Vec128 PromoteTo(Simd /* tag */, + const Vec128 v) { + return Vec128(vget_low_s16(vmovl_s8(v.raw))); +} +template +HWY_API Vec128 PromoteTo(Simd /* tag */, + const Vec128 v) { + int16x8_t a = vmovl_s8(v.raw); + int32x4_t b = vmovl_s16(vget_low_s16(a)); + return Vec128(vget_low_s32(b)); +} +template +HWY_API Vec128 PromoteTo(Simd /* tag */, + const Vec128 v) { + return Vec128(vget_low_s32(vmovl_s16(v.raw))); +} +template +HWY_API Vec128 PromoteTo(Simd /* tag */, + const Vec128 v) { + return Vec128(vget_low_s64(vmovl_s32(v.raw))); +} + +#if __ARM_FP & 2 + +HWY_API Vec128 PromoteTo(Full128 /* tag */, + const Vec128 v) { + const float32x4_t f32 = vcvt_f32_f16(vreinterpret_f16_u16(v.raw)); + return Vec128(f32); +} +template +HWY_API Vec128 PromoteTo(Simd /* tag */, + const Vec128 v) { + const float32x4_t f32 = vcvt_f32_f16(vreinterpret_f16_u16(v.raw)); + return Vec128(vget_low_f32(f32)); +} + +#else + +template +HWY_API Vec128 PromoteTo(Simd df32, + const Vec128 v) { + const RebindToSigned di32; + const RebindToUnsigned du32; + // Expand to u32 so we can shift. 
+ const auto bits16 = PromoteTo(du32, Vec128{v.raw}); + const auto sign = ShiftRight<15>(bits16); + const auto biased_exp = ShiftRight<10>(bits16) & Set(du32, 0x1F); + const auto mantissa = bits16 & Set(du32, 0x3FF); + const auto subnormal = + BitCast(du32, ConvertTo(df32, BitCast(di32, mantissa)) * + Set(df32, 1.0f / 16384 / 1024)); + + const auto biased_exp32 = biased_exp + Set(du32, 127 - 15); + const auto mantissa32 = ShiftLeft<23 - 10>(mantissa); + const auto normal = ShiftLeft<23>(biased_exp32) | mantissa32; + const auto bits32 = IfThenElse(biased_exp == Zero(du32), subnormal, normal); + return BitCast(df32, ShiftLeft<31>(sign) | bits32); +} + +#endif + +#if HWY_ARCH_ARM_A64 + +HWY_API Vec128 PromoteTo(Full128 /* tag */, + const Vec64 v) { + return Vec128(vcvt_f64_f32(v.raw)); +} + +HWY_API Vec64 PromoteTo(Full64 /* tag */, + const Vec32 v) { + return Vec64(vget_low_f64(vcvt_f64_f32(v.raw))); +} + +HWY_API Vec128 PromoteTo(Full128 /* tag */, + const Vec64 v) { + const int64x2_t i64 = vmovl_s32(v.raw); + return Vec128(vcvtq_f64_s64(i64)); +} + +HWY_API Vec64 PromoteTo(Full64 /* tag */, + const Vec32 v) { + const int64x1_t i64 = vget_low_s64(vmovl_s32(v.raw)); + return Vec64(vcvt_f64_s64(i64)); +} + +#endif + +// ------------------------------ Demotions (full -> part w/ narrow lanes) + +// From full vector to half or quarter +HWY_API Vec64 DemoteTo(Full64 /* tag */, + const Vec128 v) { + return Vec64(vqmovun_s32(v.raw)); +} +HWY_API Vec64 DemoteTo(Full64 /* tag */, + const Vec128 v) { + return Vec64(vqmovn_s32(v.raw)); +} +HWY_API Vec32 DemoteTo(Full32 /* tag */, + const Vec128 v) { + const uint16x4_t a = vqmovun_s32(v.raw); + return Vec32(vqmovn_u16(vcombine_u16(a, a))); +} +HWY_API Vec64 DemoteTo(Full64 /* tag */, + const Vec128 v) { + return Vec64(vqmovun_s16(v.raw)); +} +HWY_API Vec32 DemoteTo(Full32 /* tag */, + const Vec128 v) { + const int16x4_t a = vqmovn_s32(v.raw); + return Vec32(vqmovn_s16(vcombine_s16(a, a))); +} +HWY_API Vec64 DemoteTo(Full64 /* tag */, + const Vec128 v) { + return Vec64(vqmovn_s16(v.raw)); +} + +// From half vector to partial half +template +HWY_API Vec128 DemoteTo(Simd /* tag */, + const Vec128 v) { + return Vec128(vqmovun_s32(vcombine_s32(v.raw, v.raw))); +} +template +HWY_API Vec128 DemoteTo(Simd /* tag */, + const Vec128 v) { + return Vec128(vqmovn_s32(vcombine_s32(v.raw, v.raw))); +} +template +HWY_API Vec128 DemoteTo(Simd /* tag */, + const Vec128 v) { + const uint16x4_t a = vqmovun_s32(vcombine_s32(v.raw, v.raw)); + return Vec128(vqmovn_u16(vcombine_u16(a, a))); +} +template +HWY_API Vec128 DemoteTo(Simd /* tag */, + const Vec128 v) { + return Vec128(vqmovun_s16(vcombine_s16(v.raw, v.raw))); +} +template +HWY_API Vec128 DemoteTo(Simd /* tag */, + const Vec128 v) { + const int16x4_t a = vqmovn_s32(vcombine_s32(v.raw, v.raw)); + return Vec128(vqmovn_s16(vcombine_s16(a, a))); +} +template +HWY_API Vec128 DemoteTo(Simd /* tag */, + const Vec128 v) { + return Vec128(vqmovn_s16(vcombine_s16(v.raw, v.raw))); +} + +#if __ARM_FP & 2 + +HWY_API Vec128 DemoteTo(Full64 /* tag */, + const Vec128 v) { + return Vec128{vreinterpret_u16_f16(vcvt_f16_f32(v.raw))}; +} +template +HWY_API Vec128 DemoteTo(Simd /* tag */, + const Vec128 v) { + const float16x4_t f16 = vcvt_f16_f32(vcombine_f32(v.raw, v.raw)); + return Vec128(vreinterpret_u16_f16(f16)); +} + +#else + +template +HWY_API Vec128 DemoteTo(Simd df16, + const Vec128 v) { + const RebindToUnsigned du16; + const Rebind du; + const RebindToSigned di; + const auto bits32 = BitCast(du, v); + const auto sign = 
ShiftRight<31>(bits32); + const auto biased_exp32 = ShiftRight<23>(bits32) & Set(du, 0xFF); + const auto mantissa32 = bits32 & Set(du, 0x7FFFFF); + + const auto k15 = Set(di, 15); + const auto exp = Min(BitCast(di, biased_exp32) - Set(di, 127), k15); + const auto is_tiny = exp < Set(di, -24); + + const auto is_subnormal = exp < Set(di, -14); + const auto biased_exp16 = + BitCast(du, IfThenZeroElse(is_subnormal, exp + k15)); + const auto sub_exp = BitCast(du, Set(di, -14) - exp); // [1, 11) + const auto sub_m = (Set(du, 1) << (Set(du, 10) - sub_exp)) + + (mantissa32 >> (Set(du, 13) + sub_exp)); + const auto mantissa16 = IfThenElse(RebindMask(du, is_subnormal), sub_m, + ShiftRight<13>(mantissa32)); // <1024 + + const auto sign16 = ShiftLeft<15>(sign); + const auto normal16 = sign16 | ShiftLeft<10>(biased_exp16) | mantissa16; + const auto bits16 = IfThenZeroElse(is_tiny, BitCast(di, normal16)); + return Vec128(DemoteTo(du16, bits16).raw); +} + +#endif + +template +HWY_API Vec128 DemoteTo(Simd dbf16, + const Vec128 v) { + const Rebind di32; + const Rebind du32; // for logical shift right + const Rebind du16; + const auto bits_in_32 = BitCast(di32, ShiftRight<16>(BitCast(du32, v))); + return BitCast(dbf16, DemoteTo(du16, bits_in_32)); +} + +#if HWY_ARCH_ARM_A64 + +HWY_API Vec64 DemoteTo(Full64 /* tag */, const Vec128 v) { + return Vec64(vcvt_f32_f64(v.raw)); +} +HWY_API Vec32 DemoteTo(Full32 /* tag */, const Vec64 v) { + return Vec32(vcvt_f32_f64(vcombine_f64(v.raw, v.raw))); +} + +HWY_API Vec64 DemoteTo(Full64 /* tag */, + const Vec128 v) { + const int64x2_t i64 = vcvtq_s64_f64(v.raw); + return Vec64(vqmovn_s64(i64)); +} +HWY_API Vec32 DemoteTo(Full32 /* tag */, + const Vec64 v) { + const int64x1_t i64 = vcvt_s64_f64(v.raw); + // There is no i64x1 -> i32x1 narrow, so expand to int64x2_t first. + const int64x2_t i64x2 = vcombine_s64(i64, i64); + return Vec32(vqmovn_s64(i64x2)); +} + +#endif + +HWY_API Vec32 U8FromU32(const Vec128 v) { + const uint8x16_t org_v = detail::BitCastToByte(v).raw; + const uint8x16_t w = vuzp1q_u8(org_v, org_v); + return Vec32(vget_low_u8(vuzp1q_u8(w, w))); +} +template +HWY_API Vec128 U8FromU32(const Vec128 v) { + const uint8x8_t org_v = detail::BitCastToByte(v).raw; + const uint8x8_t w = vuzp1_u8(org_v, org_v); + return Vec128(vuzp1_u8(w, w)); +} + +// In the following DemoteTo functions, |b| is purposely undefined. +// The value a needs to be extended to 128 bits so that vqmovn can be +// used and |b| is undefined so that no extra overhead is introduced. 
+HWY_DIAGNOSTICS(push) +HWY_DIAGNOSTICS_OFF(disable : 4701, ignored "-Wuninitialized") + +template +HWY_API Vec128 DemoteTo(Simd /* tag */, + const Vec128 v) { + Vec128 a = DemoteTo(Simd(), v); + Vec128 b; + uint16x8_t c = vcombine_u16(a.raw, b.raw); + return Vec128(vqmovn_u16(c)); +} + +template +HWY_API Vec128 DemoteTo(Simd /* tag */, + const Vec128 v) { + Vec128 a = DemoteTo(Simd(), v); + Vec128 b; + int16x8_t c = vcombine_s16(a.raw, b.raw); + return Vec128(vqmovn_s16(c)); +} + +HWY_DIAGNOSTICS(pop) + +// ------------------------------ Convert integer <=> floating-point + +HWY_API Vec128 ConvertTo(Full128 /* tag */, + const Vec128 v) { + return Vec128(vcvtq_f32_s32(v.raw)); +} +template +HWY_API Vec128 ConvertTo(Simd /* tag */, + const Vec128 v) { + return Vec128(vcvt_f32_s32(v.raw)); +} + +HWY_API Vec128 ConvertTo(Full128 /* tag */, + const Vec128 v) { + return Vec128(vcvtq_f32_u32(v.raw)); +} +template +HWY_API Vec128 ConvertTo(Simd /* tag */, + const Vec128 v) { + return Vec128(vcvt_f32_u32(v.raw)); +} + +// Truncates (rounds toward zero). +HWY_API Vec128 ConvertTo(Full128 /* tag */, + const Vec128 v) { + return Vec128(vcvtq_s32_f32(v.raw)); +} +template +HWY_API Vec128 ConvertTo(Simd /* tag */, + const Vec128 v) { + return Vec128(vcvt_s32_f32(v.raw)); +} + +#if HWY_ARCH_ARM_A64 + +HWY_API Vec128 ConvertTo(Full128 /* tag */, + const Vec128 v) { + return Vec128(vcvtq_f64_s64(v.raw)); +} +HWY_API Vec64 ConvertTo(Full64 /* tag */, + const Vec64 v) { + return Vec64(vcvt_f64_s64(v.raw)); +} + +HWY_API Vec128 ConvertTo(Full128 /* tag */, + const Vec128 v) { + return Vec128(vcvtq_f64_u64(v.raw)); +} +HWY_API Vec64 ConvertTo(Full64 /* tag */, + const Vec64 v) { + return Vec64(vcvt_f64_u64(v.raw)); +} + +// Truncates (rounds toward zero). +HWY_API Vec128 ConvertTo(Full128 /* tag */, + const Vec128 v) { + return Vec128(vcvtq_s64_f64(v.raw)); +} +HWY_API Vec64 ConvertTo(Full64 /* tag */, + const Vec64 v) { + return Vec64(vcvt_s64_f64(v.raw)); +} + +#endif + +// ------------------------------ Round (IfThenElse, mask, logical) + +#if HWY_ARCH_ARM_A64 +// Toward nearest integer +HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Round, vrndn, _, 1) + +// Toward zero, aka truncate +HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Trunc, vrnd, _, 1) + +// Toward +infinity, aka ceiling +HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Ceil, vrndp, _, 1) + +// Toward -infinity, aka floor +HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Floor, vrndm, _, 1) +#else + +// ------------------------------ Trunc + +// ARMv7 only supports truncation to integer. We can either convert back to +// float (3 floating-point and 2 logic operations) or manipulate the binary32 +// representation, clearing the lowest 23-exp mantissa bits. This requires 9 +// integer operations and 3 constants, which is likely more expensive. + +namespace detail { + +// The original value is already the desired result if NaN or the magnitude is +// large (i.e. the value is already an integer). 
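+// For binary32, MantissaEnd<float>() is 2^23 = 8388608.0f: any |v| >= 2^23
+// has an ulp of at least 1 and is therefore already integral. NaN also fails
+// the < comparison below, so both cases fall through to returning v.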
+template +HWY_INLINE Mask128 UseInt(const Vec128 v) { + return Abs(v) < Set(Simd(), MantissaEnd()); +} + +} // namespace detail + +template +HWY_API Vec128 Trunc(const Vec128 v) { + const DFromV df; + const RebindToSigned di; + + const auto integer = ConvertTo(di, v); // round toward 0 + const auto int_f = ConvertTo(df, integer); + + return IfThenElse(detail::UseInt(v), int_f, v); +} + +template +HWY_API Vec128 Round(const Vec128 v) { + const DFromV df; + + // ARMv7 also lacks a native NearestInt, but we can instead rely on rounding + // (we assume the current mode is nearest-even) after addition with a large + // value such that no mantissa bits remain. We may need a compiler flag for + // precise floating-point to prevent this from being "optimized" out. + const auto max = Set(df, MantissaEnd()); + const auto large = CopySignToAbs(max, v); + const auto added = large + v; + const auto rounded = added - large; + + // Keep original if NaN or the magnitude is large (already an int). + return IfThenElse(Abs(v) < max, rounded, v); +} + +template +HWY_API Vec128 Ceil(const Vec128 v) { + const DFromV df; + const RebindToSigned di; + + const auto integer = ConvertTo(di, v); // round toward 0 + const auto int_f = ConvertTo(df, integer); + + // Truncating a positive non-integer ends up smaller; if so, add 1. + const auto neg1 = ConvertTo(df, VecFromMask(di, RebindMask(di, int_f < v))); + + return IfThenElse(detail::UseInt(v), int_f - neg1, v); +} + +template +HWY_API Vec128 Floor(const Vec128 v) { + const DFromV df; + const RebindToSigned di; + + const auto integer = ConvertTo(di, v); // round toward 0 + const auto int_f = ConvertTo(df, integer); + + // Truncating a negative non-integer ends up larger; if so, subtract 1. + const auto neg1 = ConvertTo(df, VecFromMask(di, RebindMask(di, int_f > v))); + + return IfThenElse(detail::UseInt(v), int_f + neg1, v); +} + +#endif + +// ------------------------------ NearestInt (Round) + +#if HWY_ARCH_ARM_A64 + +HWY_API Vec128 NearestInt(const Vec128 v) { + return Vec128(vcvtnq_s32_f32(v.raw)); +} +template +HWY_API Vec128 NearestInt(const Vec128 v) { + return Vec128(vcvtn_s32_f32(v.raw)); +} + +#else + +template +HWY_API Vec128 NearestInt(const Vec128 v) { + const RebindToSigned> di; + return ConvertTo(di, Round(v)); +} + +#endif + +// ------------------------------ Floating-point classification +template +HWY_API Mask128 IsNaN(const Vec128 v) { + return v != v; +} + +template +HWY_API Mask128 IsInf(const Vec128 v) { + const Simd d; + const RebindToSigned di; + const VFromD vi = BitCast(di, v); + // 'Shift left' to clear the sign bit, check for exponent=max and mantissa=0. + return RebindMask(d, Eq(Add(vi, vi), Set(di, hwy::MaxExponentTimes2()))); +} + +// Returns whether normal/subnormal/zero. +template +HWY_API Mask128 IsFinite(const Vec128 v) { + const Simd d; + const RebindToUnsigned du; + const RebindToSigned di; // cheaper than unsigned comparison + const VFromD vu = BitCast(du, v); + // 'Shift left' to clear the sign bit, then right so we can compare with the + // max exponent (cannot compare with MaxExponentTimes2 directly because it is + // negative and non-negative floats would be greater). 
+ const VFromD exp = + BitCast(di, ShiftRight() + 1>(Add(vu, vu))); + return RebindMask(d, Lt(exp, Set(di, hwy::MaxExponentField()))); +} + +// ================================================== SWIZZLE + +// ------------------------------ LowerHalf + +// <= 64 bit: just return different type +template +HWY_API Vec128 LowerHalf(const Vec128 v) { + return Vec128(v.raw); +} + +HWY_API Vec64 LowerHalf(const Vec128 v) { + return Vec64(vget_low_u8(v.raw)); +} +HWY_API Vec64 LowerHalf(const Vec128 v) { + return Vec64(vget_low_u16(v.raw)); +} +HWY_API Vec64 LowerHalf(const Vec128 v) { + return Vec64(vget_low_u32(v.raw)); +} +HWY_API Vec64 LowerHalf(const Vec128 v) { + return Vec64(vget_low_u64(v.raw)); +} +HWY_API Vec64 LowerHalf(const Vec128 v) { + return Vec64(vget_low_s8(v.raw)); +} +HWY_API Vec64 LowerHalf(const Vec128 v) { + return Vec64(vget_low_s16(v.raw)); +} +HWY_API Vec64 LowerHalf(const Vec128 v) { + return Vec64(vget_low_s32(v.raw)); +} +HWY_API Vec64 LowerHalf(const Vec128 v) { + return Vec64(vget_low_s64(v.raw)); +} +HWY_API Vec64 LowerHalf(const Vec128 v) { + return Vec64(vget_low_f32(v.raw)); +} +#if HWY_ARCH_ARM_A64 +HWY_API Vec64 LowerHalf(const Vec128 v) { + return Vec64(vget_low_f64(v.raw)); +} +#endif +HWY_API Vec64 LowerHalf(const Vec128 v) { + const Full128 du; + const Full64 dbh; + return BitCast(dbh, LowerHalf(BitCast(du, v))); +} + +template +HWY_API Vec128 LowerHalf(Simd /* tag */, + Vec128 v) { + return LowerHalf(v); +} + +// ------------------------------ CombineShiftRightBytes + +// 128-bit +template > +HWY_API V128 CombineShiftRightBytes(Full128 d, V128 hi, V128 lo) { + static_assert(0 < kBytes && kBytes < 16, "kBytes must be in [1, 15]"); + const Repartition d8; + uint8x16_t v8 = vextq_u8(BitCast(d8, lo).raw, BitCast(d8, hi).raw, kBytes); + return BitCast(d, Vec128(v8)); +} + +// 64-bit +template +HWY_API Vec64 CombineShiftRightBytes(Full64 d, Vec64 hi, Vec64 lo) { + static_assert(0 < kBytes && kBytes < 8, "kBytes must be in [1, 7]"); + const Repartition d8; + uint8x8_t v8 = vext_u8(BitCast(d8, lo).raw, BitCast(d8, hi).raw, kBytes); + return BitCast(d, VFromD(v8)); +} + +// <= 32-bit defined after ShiftLeftBytes. + +// ------------------------------ Shift vector by constant #bytes + +namespace detail { + +// Partially specialize because kBytes = 0 and >= size are compile errors; +// callers replace the latter with 0xFF for easier specialization. +template +struct ShiftLeftBytesT { + // Full + template + HWY_INLINE Vec128 operator()(const Vec128 v) { + const Full128 d; + return CombineShiftRightBytes<16 - kBytes>(d, v, Zero(d)); + } + + // Partial + template + HWY_INLINE Vec128 operator()(const Vec128 v) { + // Expand to 64-bit so we only use the native EXT instruction. + const Full64 d64; + const auto zero64 = Zero(d64); + const decltype(zero64) v64(v.raw); + return Vec128( + CombineShiftRightBytes<8 - kBytes>(d64, v64, zero64).raw); + } +}; +template <> +struct ShiftLeftBytesT<0> { + template + HWY_INLINE Vec128 operator()(const Vec128 v) { + return v; + } +}; +template <> +struct ShiftLeftBytesT<0xFF> { + template + HWY_INLINE Vec128 operator()(const Vec128 /* v */) { + return Zero(Simd()); + } +}; + +template +struct ShiftRightBytesT { + template + HWY_INLINE Vec128 operator()(Vec128 v) { + const Simd d; + // For < 64-bit vectors, zero undefined lanes so we shift in zeros. + if (N * sizeof(T) < 8) { + constexpr size_t kReg = N * sizeof(T) == 16 ? 
16 : 8; + const Simd dreg; + v = Vec128( + IfThenElseZero(FirstN(dreg, N), VFromD(v.raw)).raw); + } + return CombineShiftRightBytes(d, Zero(d), v); + } +}; +template <> +struct ShiftRightBytesT<0> { + template + HWY_INLINE Vec128 operator()(const Vec128 v) { + return v; + } +}; +template <> +struct ShiftRightBytesT<0xFF> { + template + HWY_INLINE Vec128 operator()(const Vec128 /* v */) { + return Zero(Simd()); + } +}; + +} // namespace detail + +template +HWY_API Vec128 ShiftLeftBytes(Simd /* tag */, Vec128 v) { + return detail::ShiftLeftBytesT < kBytes >= N * sizeof(T) ? 0xFF + : kBytes > ()(v); +} + +template +HWY_API Vec128 ShiftLeftBytes(const Vec128 v) { + return ShiftLeftBytes(Simd(), v); +} + +template +HWY_API Vec128 ShiftLeftLanes(Simd d, const Vec128 v) { + const Repartition d8; + return BitCast(d, ShiftLeftBytes(BitCast(d8, v))); +} + +template +HWY_API Vec128 ShiftLeftLanes(const Vec128 v) { + return ShiftLeftLanes(Simd(), v); +} + +// 0x01..0F, kBytes = 1 => 0x0001..0E +template +HWY_API Vec128 ShiftRightBytes(Simd /* tag */, Vec128 v) { + return detail::ShiftRightBytesT < kBytes >= N * sizeof(T) ? 0xFF + : kBytes > ()(v); +} + +template +HWY_API Vec128 ShiftRightLanes(Simd d, const Vec128 v) { + const Repartition d8; + return BitCast(d, ShiftRightBytes(d8, BitCast(d8, v))); +} + +// Calls ShiftLeftBytes +template +HWY_API Vec128 CombineShiftRightBytes(Simd d, Vec128 hi, + Vec128 lo) { + constexpr size_t kSize = N * sizeof(T); + static_assert(0 < kBytes && kBytes < kSize, "kBytes invalid"); + const Repartition d8; + const Full64 d_full8; + const Repartition d_full; + using V64 = VFromD; + const V64 hi64(BitCast(d8, hi).raw); + // Move into most-significant bytes + const V64 lo64 = ShiftLeftBytes<8 - kSize>(V64(BitCast(d8, lo).raw)); + const V64 r = CombineShiftRightBytes<8 - kSize + kBytes>(d_full8, hi64, lo64); + // After casting to full 64-bit vector of correct type, shrink to 32-bit + return Vec128(BitCast(d_full, r).raw); +} + +// ------------------------------ UpperHalf (ShiftRightBytes) + +// Full input +HWY_API Vec64 UpperHalf(Full64 /* tag */, + const Vec128 v) { + return Vec64(vget_high_u8(v.raw)); +} +HWY_API Vec64 UpperHalf(Full64 /* tag */, + const Vec128 v) { + return Vec64(vget_high_u16(v.raw)); +} +HWY_API Vec64 UpperHalf(Full64 /* tag */, + const Vec128 v) { + return Vec64(vget_high_u32(v.raw)); +} +HWY_API Vec64 UpperHalf(Full64 /* tag */, + const Vec128 v) { + return Vec64(vget_high_u64(v.raw)); +} +HWY_API Vec64 UpperHalf(Full64 /* tag */, + const Vec128 v) { + return Vec64(vget_high_s8(v.raw)); +} +HWY_API Vec64 UpperHalf(Full64 /* tag */, + const Vec128 v) { + return Vec64(vget_high_s16(v.raw)); +} +HWY_API Vec64 UpperHalf(Full64 /* tag */, + const Vec128 v) { + return Vec64(vget_high_s32(v.raw)); +} +HWY_API Vec64 UpperHalf(Full64 /* tag */, + const Vec128 v) { + return Vec64(vget_high_s64(v.raw)); +} +HWY_API Vec64 UpperHalf(Full64 /* tag */, const Vec128 v) { + return Vec64(vget_high_f32(v.raw)); +} +#if HWY_ARCH_ARM_A64 +HWY_API Vec64 UpperHalf(Full64 /* tag */, + const Vec128 v) { + return Vec64(vget_high_f64(v.raw)); +} +#endif + +HWY_API Vec64 UpperHalf(Full64 dbh, + const Vec128 v) { + const RebindToUnsigned duh; + const Twice du; + return BitCast(dbh, UpperHalf(duh, BitCast(du, v))); +} + +// Partial +template +HWY_API Vec128 UpperHalf(Half> /* tag */, + Vec128 v) { + const DFromV d; + const RebindToUnsigned du; + const auto vu = BitCast(du, v); + const auto upper = BitCast(d, ShiftRightBytes(du, vu)); + return Vec128(upper.raw); +} + +// 
------------------------------ Broadcast/splat any lane + +#if HWY_ARCH_ARM_A64 +// Unsigned +template +HWY_API Vec128 Broadcast(const Vec128 v) { + static_assert(0 <= kLane && kLane < 8, "Invalid lane"); + return Vec128(vdupq_laneq_u16(v.raw, kLane)); +} +template +HWY_API Vec128 Broadcast(const Vec128 v) { + static_assert(0 <= kLane && kLane < N, "Invalid lane"); + return Vec128(vdup_lane_u16(v.raw, kLane)); +} +template +HWY_API Vec128 Broadcast(const Vec128 v) { + static_assert(0 <= kLane && kLane < 4, "Invalid lane"); + return Vec128(vdupq_laneq_u32(v.raw, kLane)); +} +template +HWY_API Vec128 Broadcast(const Vec128 v) { + static_assert(0 <= kLane && kLane < N, "Invalid lane"); + return Vec128(vdup_lane_u32(v.raw, kLane)); +} +template +HWY_API Vec128 Broadcast(const Vec128 v) { + static_assert(0 <= kLane && kLane < 2, "Invalid lane"); + return Vec128(vdupq_laneq_u64(v.raw, kLane)); +} +// Vec64 is defined below. + +// Signed +template +HWY_API Vec128 Broadcast(const Vec128 v) { + static_assert(0 <= kLane && kLane < 8, "Invalid lane"); + return Vec128(vdupq_laneq_s16(v.raw, kLane)); +} +template +HWY_API Vec128 Broadcast(const Vec128 v) { + static_assert(0 <= kLane && kLane < N, "Invalid lane"); + return Vec128(vdup_lane_s16(v.raw, kLane)); +} +template +HWY_API Vec128 Broadcast(const Vec128 v) { + static_assert(0 <= kLane && kLane < 4, "Invalid lane"); + return Vec128(vdupq_laneq_s32(v.raw, kLane)); +} +template +HWY_API Vec128 Broadcast(const Vec128 v) { + static_assert(0 <= kLane && kLane < N, "Invalid lane"); + return Vec128(vdup_lane_s32(v.raw, kLane)); +} +template +HWY_API Vec128 Broadcast(const Vec128 v) { + static_assert(0 <= kLane && kLane < 2, "Invalid lane"); + return Vec128(vdupq_laneq_s64(v.raw, kLane)); +} +// Vec64 is defined below. + +// Float +template +HWY_API Vec128 Broadcast(const Vec128 v) { + static_assert(0 <= kLane && kLane < 4, "Invalid lane"); + return Vec128(vdupq_laneq_f32(v.raw, kLane)); +} +template +HWY_API Vec128 Broadcast(const Vec128 v) { + static_assert(0 <= kLane && kLane < N, "Invalid lane"); + return Vec128(vdup_lane_f32(v.raw, kLane)); +} +template +HWY_API Vec128 Broadcast(const Vec128 v) { + static_assert(0 <= kLane && kLane < 2, "Invalid lane"); + return Vec128(vdupq_laneq_f64(v.raw, kLane)); +} +template +HWY_API Vec64 Broadcast(const Vec64 v) { + static_assert(0 <= kLane && kLane < 1, "Invalid lane"); + return v; +} + +#else +// No vdupq_laneq_* on armv7: use vgetq_lane_* + vdupq_n_*. + +// Unsigned +template +HWY_API Vec128 Broadcast(const Vec128 v) { + static_assert(0 <= kLane && kLane < 8, "Invalid lane"); + return Vec128(vdupq_n_u16(vgetq_lane_u16(v.raw, kLane))); +} +template +HWY_API Vec128 Broadcast(const Vec128 v) { + static_assert(0 <= kLane && kLane < N, "Invalid lane"); + return Vec128(vdup_lane_u16(v.raw, kLane)); +} +template +HWY_API Vec128 Broadcast(const Vec128 v) { + static_assert(0 <= kLane && kLane < 4, "Invalid lane"); + return Vec128(vdupq_n_u32(vgetq_lane_u32(v.raw, kLane))); +} +template +HWY_API Vec128 Broadcast(const Vec128 v) { + static_assert(0 <= kLane && kLane < N, "Invalid lane"); + return Vec128(vdup_lane_u32(v.raw, kLane)); +} +template +HWY_API Vec128 Broadcast(const Vec128 v) { + static_assert(0 <= kLane && kLane < 2, "Invalid lane"); + return Vec128(vdupq_n_u64(vgetq_lane_u64(v.raw, kLane))); +} +// Vec64 is defined below. 
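+
+// Illustrative sketch of this fallback in standalone intrinsics (assumes
+// <arm_neon.h>; BroadcastLane3 is a hypothetical helper, not part of this
+// file): read the lane out to a scalar, then splat it with vdupq_n_*.
+static inline uint32x4_t BroadcastLane3(uint32x4_t v) {
+  return vdupq_n_u32(vgetq_lane_u32(v, 3));  // lane extract + broadcast
+}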
+ +// Signed +template +HWY_API Vec128 Broadcast(const Vec128 v) { + static_assert(0 <= kLane && kLane < 8, "Invalid lane"); + return Vec128(vdupq_n_s16(vgetq_lane_s16(v.raw, kLane))); +} +template +HWY_API Vec128 Broadcast(const Vec128 v) { + static_assert(0 <= kLane && kLane < N, "Invalid lane"); + return Vec128(vdup_lane_s16(v.raw, kLane)); +} +template +HWY_API Vec128 Broadcast(const Vec128 v) { + static_assert(0 <= kLane && kLane < 4, "Invalid lane"); + return Vec128(vdupq_n_s32(vgetq_lane_s32(v.raw, kLane))); +} +template +HWY_API Vec128 Broadcast(const Vec128 v) { + static_assert(0 <= kLane && kLane < N, "Invalid lane"); + return Vec128(vdup_lane_s32(v.raw, kLane)); +} +template +HWY_API Vec128 Broadcast(const Vec128 v) { + static_assert(0 <= kLane && kLane < 2, "Invalid lane"); + return Vec128(vdupq_n_s64(vgetq_lane_s64(v.raw, kLane))); +} +// Vec64 is defined below. + +// Float +template +HWY_API Vec128 Broadcast(const Vec128 v) { + static_assert(0 <= kLane && kLane < 4, "Invalid lane"); + return Vec128(vdupq_n_f32(vgetq_lane_f32(v.raw, kLane))); +} +template +HWY_API Vec128 Broadcast(const Vec128 v) { + static_assert(0 <= kLane && kLane < N, "Invalid lane"); + return Vec128(vdup_lane_f32(v.raw, kLane)); +} + +#endif + +template +HWY_API Vec64 Broadcast(const Vec64 v) { + static_assert(0 <= kLane && kLane < 1, "Invalid lane"); + return v; +} +template +HWY_API Vec64 Broadcast(const Vec64 v) { + static_assert(0 <= kLane && kLane < 1, "Invalid lane"); + return v; +} + +// ------------------------------ TableLookupLanes + +// Returned by SetTableIndices for use by TableLookupLanes. +template +struct Indices128 { + typename detail::Raw128::type raw; +}; + +template +HWY_API Indices128 IndicesFromVec(Simd d, Vec128 vec) { + static_assert(sizeof(T) == sizeof(TI), "Index size must match lane"); +#if HWY_IS_DEBUG_BUILD + const Rebind di; + HWY_DASSERT(AllFalse(di, Lt(vec, Zero(di))) && + AllTrue(di, Lt(vec, Set(di, static_cast(N))))); +#endif + + const Repartition d8; + using V8 = VFromD; + const Repartition d16; + + // Broadcast each lane index to all bytes of T and shift to bytes + static_assert(sizeof(T) == 4 || sizeof(T) == 8, ""); + if (sizeof(T) == 4) { + alignas(16) constexpr uint8_t kBroadcastLaneBytes[16] = { + 0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12}; + const V8 lane_indices = + TableLookupBytes(BitCast(d8, vec), Load(d8, kBroadcastLaneBytes)); + const V8 byte_indices = + BitCast(d8, ShiftLeft<2>(BitCast(d16, lane_indices))); + alignas(16) constexpr uint8_t kByteOffsets[16] = {0, 1, 2, 3, 0, 1, 2, 3, + 0, 1, 2, 3, 0, 1, 2, 3}; + const V8 sum = Add(byte_indices, Load(d8, kByteOffsets)); + return Indices128{BitCast(d, sum).raw}; + } else { + alignas(16) constexpr uint8_t kBroadcastLaneBytes[16] = { + 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8}; + const V8 lane_indices = + TableLookupBytes(BitCast(d8, vec), Load(d8, kBroadcastLaneBytes)); + const V8 byte_indices = + BitCast(d8, ShiftLeft<3>(BitCast(d16, lane_indices))); + alignas(16) constexpr uint8_t kByteOffsets[16] = {0, 1, 2, 3, 4, 5, 6, 7, + 0, 1, 2, 3, 4, 5, 6, 7}; + const V8 sum = Add(byte_indices, Load(d8, kByteOffsets)); + return Indices128{BitCast(d, sum).raw}; + } +} + +template +HWY_API Indices128 SetTableIndices(Simd d, const TI* idx) { + const Rebind di; + return IndicesFromVec(d, LoadU(di, idx)); +} + +template +HWY_API Vec128 TableLookupLanes(Vec128 v, Indices128 idx) { + const DFromV d; + const RebindToSigned di; + return BitCast( + d, TableLookupBytes(BitCast(di, v), BitCast(di, Vec128{idx.raw}))); 
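+  // e.g. with 32-bit lanes, IndicesFromVec expanded lane indices {3,0,1,2}
+  // into byte indices {12,13,14,15, 0,1,2,3, 4,5,6,7, 8,9,10,11}, so the
+  // byte shuffle above permutes whole lanes.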
+} + +// ------------------------------ Reverse (Shuffle0123, Shuffle2301, Shuffle01) + +// Single lane: no change +template +HWY_API Vec128 Reverse(Simd /* tag */, const Vec128 v) { + return v; +} + +// Two lanes: shuffle +template +HWY_API Vec128 Reverse(Simd /* tag */, const Vec128 v) { + return Vec128(Shuffle2301(v)); +} + +template +HWY_API Vec128 Reverse(Full128 /* tag */, const Vec128 v) { + return Shuffle01(v); +} + +// Four lanes: shuffle +template +HWY_API Vec128 Reverse(Full128 /* tag */, const Vec128 v) { + return Shuffle0123(v); +} + +// 16-bit +template +HWY_API Vec128 Reverse(Simd d, const Vec128 v) { + const RepartitionToWide> du32; + return BitCast(d, RotateRight<16>(Reverse(du32, BitCast(du32, v)))); +} + +// ------------------------------ Reverse2 + +template +HWY_API Vec128 Reverse2(Simd d, const Vec128 v) { + const RebindToUnsigned du; + return BitCast(d, Vec128(vrev32_u16(BitCast(du, v).raw))); +} +template +HWY_API Vec128 Reverse2(Full128 d, const Vec128 v) { + const RebindToUnsigned du; + return BitCast(d, Vec128(vrev32q_u16(BitCast(du, v).raw))); +} + +template +HWY_API Vec128 Reverse2(Simd d, const Vec128 v) { + const RebindToUnsigned du; + return BitCast(d, Vec128(vrev64_u32(BitCast(du, v).raw))); +} +template +HWY_API Vec128 Reverse2(Full128 d, const Vec128 v) { + const RebindToUnsigned du; + return BitCast(d, Vec128(vrev64q_u32(BitCast(du, v).raw))); +} + +template +HWY_API Vec128 Reverse2(Simd /* tag */, const Vec128 v) { + return Shuffle01(v); +} + +// ------------------------------ Reverse4 + +template +HWY_API Vec128 Reverse4(Simd d, const Vec128 v) { + const RebindToUnsigned du; + return BitCast(d, Vec128(vrev64_u16(BitCast(du, v).raw))); +} +template +HWY_API Vec128 Reverse4(Full128 d, const Vec128 v) { + const RebindToUnsigned du; + return BitCast(d, Vec128(vrev64q_u16(BitCast(du, v).raw))); +} + +template +HWY_API Vec128 Reverse4(Simd /* tag */, const Vec128 v) { + return Shuffle0123(v); +} + +template +HWY_API Vec128 Reverse4(Simd /* tag */, const Vec128) { + HWY_ASSERT(0); // don't have 8 u64 lanes +} + +// ------------------------------ Reverse8 + +template +HWY_API Vec128 Reverse8(Simd d, const Vec128 v) { + return Reverse(d, v); +} + +template +HWY_API Vec128 Reverse8(Simd, const Vec128) { + HWY_ASSERT(0); // don't have 8 lanes unless 16-bit +} + +// ------------------------------ Other shuffles (TableLookupBytes) + +// Notation: let Vec128 have lanes 3,2,1,0 (0 is least-significant). +// Shuffle0321 rotates one lane to the right (the previous least-significant +// lane is now most-significant). These could also be implemented via +// CombineShiftRightBytes but the shuffle_abcd notation is more convenient. + +// Swap 64-bit halves +template +HWY_API Vec128 Shuffle1032(const Vec128 v) { + return CombineShiftRightBytes<8>(Full128(), v, v); +} +template +HWY_API Vec128 Shuffle01(const Vec128 v) { + return CombineShiftRightBytes<8>(Full128(), v, v); +} + +// Rotate right 32 bits +template +HWY_API Vec128 Shuffle0321(const Vec128 v) { + return CombineShiftRightBytes<4>(Full128(), v, v); +} + +// Rotate left 32 bits +template +HWY_API Vec128 Shuffle2103(const Vec128 v) { + return CombineShiftRightBytes<12>(Full128(), v, v); +} + +// Reverse +template +HWY_API Vec128 Shuffle0123(const Vec128 v) { + return Shuffle2301(Shuffle1032(v)); +} + +// ------------------------------ InterleaveLower + +// Interleaves lanes from halves of the 128-bit blocks of "a" (which provides +// the least-significant lane) and "b". 
To concatenate two half-width integers +// into one, use ZipLower/Upper instead (also works with scalar). +HWY_NEON_DEF_FUNCTION_INT_8_16_32(InterleaveLower, vzip1, _, 2) +HWY_NEON_DEF_FUNCTION_UINT_8_16_32(InterleaveLower, vzip1, _, 2) + +#if HWY_ARCH_ARM_A64 +// N=1 makes no sense (in that case, there would be no upper/lower). +HWY_API Vec128 InterleaveLower(const Vec128 a, + const Vec128 b) { + return Vec128(vzip1q_u64(a.raw, b.raw)); +} +HWY_API Vec128 InterleaveLower(const Vec128 a, + const Vec128 b) { + return Vec128(vzip1q_s64(a.raw, b.raw)); +} +HWY_API Vec128 InterleaveLower(const Vec128 a, + const Vec128 b) { + return Vec128(vzip1q_f64(a.raw, b.raw)); +} +#else +// ARMv7 emulation. +HWY_API Vec128 InterleaveLower(const Vec128 a, + const Vec128 b) { + return CombineShiftRightBytes<8>(Full128(), b, Shuffle01(a)); +} +HWY_API Vec128 InterleaveLower(const Vec128 a, + const Vec128 b) { + return CombineShiftRightBytes<8>(Full128(), b, Shuffle01(a)); +} +#endif + +// Floats +HWY_API Vec128 InterleaveLower(const Vec128 a, + const Vec128 b) { + return Vec128(vzip1q_f32(a.raw, b.raw)); +} +template +HWY_API Vec128 InterleaveLower(const Vec128 a, + const Vec128 b) { + return Vec128(vzip1_f32(a.raw, b.raw)); +} + +// < 64 bit parts +template +HWY_API Vec128 InterleaveLower(Vec128 a, Vec128 b) { + return Vec128(InterleaveLower(Vec64(a.raw), Vec64(b.raw)).raw); +} + +// Additional overload for the optional Simd<> tag. +template > +HWY_API V InterleaveLower(Simd /* tag */, V a, V b) { + return InterleaveLower(a, b); +} + +// ------------------------------ InterleaveUpper (UpperHalf) + +// All functions inside detail lack the required D parameter. +namespace detail { +HWY_NEON_DEF_FUNCTION_INT_8_16_32(InterleaveUpper, vzip2, _, 2) +HWY_NEON_DEF_FUNCTION_UINT_8_16_32(InterleaveUpper, vzip2, _, 2) + +#if HWY_ARCH_ARM_A64 +// N=1 makes no sense (in that case, there would be no upper/lower). +HWY_API Vec128 InterleaveUpper(const Vec128 a, + const Vec128 b) { + return Vec128(vzip2q_u64(a.raw, b.raw)); +} +HWY_API Vec128 InterleaveUpper(Vec128 a, Vec128 b) { + return Vec128(vzip2q_s64(a.raw, b.raw)); +} +HWY_API Vec128 InterleaveUpper(Vec128 a, Vec128 b) { + return Vec128(vzip2q_f64(a.raw, b.raw)); +} +#else +// ARMv7 emulation. +HWY_API Vec128 InterleaveUpper(const Vec128 a, + const Vec128 b) { + return CombineShiftRightBytes<8>(Full128(), Shuffle01(b), a); +} +HWY_API Vec128 InterleaveUpper(Vec128 a, Vec128 b) { + return CombineShiftRightBytes<8>(Full128(), Shuffle01(b), a); +} +#endif + +HWY_API Vec128 InterleaveUpper(Vec128 a, Vec128 b) { + return Vec128(vzip2q_f32(a.raw, b.raw)); +} +HWY_API Vec64 InterleaveUpper(const Vec64 a, + const Vec64 b) { + return Vec64(vzip2_f32(a.raw, b.raw)); +} + +} // namespace detail + +// Full register +template > +HWY_API V InterleaveUpper(Simd /* tag */, V a, V b) { + return detail::InterleaveUpper(a, b); +} + +// Partial +template > +HWY_API V InterleaveUpper(Simd d, V a, V b) { + const Half d2; + return InterleaveLower(d, V(UpperHalf(d2, a).raw), V(UpperHalf(d2, b).raw)); +} + +// ------------------------------ ZipLower/ZipUpper (InterleaveLower) + +// Same as Interleave*, except that the return lanes are double-width integers; +// this is necessary because the single-lane scalar cannot return two values. 
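+// Minimal scalar model (assumptions: little-endian lane order, u8 inputs;
+// ZipLowerScalar is a hypothetical helper, not part of this file): lane i
+// of the widened result holds a[i] in its low byte and b[i] in its high
+// byte.
+static inline uint16_t ZipLowerScalar(uint8_t a, uint8_t b) {
+  return static_cast<uint16_t>(a | (static_cast<unsigned>(b) << 8));
+}
+// The vector ops below follow this convention: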
+template >> +HWY_API VFromD ZipLower(V a, V b) { + return BitCast(DW(), InterleaveLower(a, b)); +} +template , class DW = RepartitionToWide> +HWY_API VFromD ZipLower(DW dw, V a, V b) { + return BitCast(dw, InterleaveLower(D(), a, b)); +} + +template , class DW = RepartitionToWide> +HWY_API VFromD ZipUpper(DW dw, V a, V b) { + return BitCast(dw, InterleaveUpper(D(), a, b)); +} + +// ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower) + +template +HWY_API Vec128 ReorderWidenMulAccumulate(Simd df32, + Vec128 a, + Vec128 b, + const Vec128 sum0, + Vec128& sum1) { + const Repartition du16; + const RebindToUnsigned du32; + const Vec128 zero = Zero(du16); + const Vec128 a0 = ZipLower(du32, zero, BitCast(du16, a)); + const Vec128 a1 = ZipUpper(du32, zero, BitCast(du16, a)); + const Vec128 b0 = ZipLower(du32, zero, BitCast(du16, b)); + const Vec128 b1 = ZipUpper(du32, zero, BitCast(du16, b)); + sum1 = MulAdd(BitCast(df32, a1), BitCast(df32, b1), sum1); + return MulAdd(BitCast(df32, a0), BitCast(df32, b0), sum0); +} + +HWY_API Vec128 ReorderWidenMulAccumulate(Full128 /*d32*/, + Vec128 a, + Vec128 b, + const Vec128 sum0, + Vec128& sum1) { +#if HWY_ARCH_ARM_A64 + sum1 = Vec128(vmlal_high_s16(sum1.raw, a.raw, b.raw)); +#else + const Full64 dh; + sum1 = Vec128( + vmlal_s16(sum1.raw, UpperHalf(dh, a).raw, UpperHalf(dh, b).raw)); +#endif + return Vec128( + vmlal_s16(sum0.raw, LowerHalf(a).raw, LowerHalf(b).raw)); +} + +HWY_API Vec64 ReorderWidenMulAccumulate(Full64 d32, + Vec64 a, + Vec64 b, + const Vec64 sum0, + Vec64& sum1) { + // vmlal writes into the upper half, which the caller cannot use, so + // split into two halves. + const Vec128 mul_3210(vmull_s16(a.raw, b.raw)); + const Vec64 mul_32 = UpperHalf(d32, mul_3210); + sum1 += mul_32; + return sum0 + LowerHalf(mul_3210); +} + +HWY_API Vec32 ReorderWidenMulAccumulate(Full32 d32, + Vec32 a, + Vec32 b, + const Vec32 sum0, + Vec32& sum1) { + const Vec128 mul_xx10(vmull_s16(a.raw, b.raw)); + const Vec64 mul_10(LowerHalf(mul_xx10)); + const Vec32 mul0 = LowerHalf(d32, mul_10); + const Vec32 mul1 = UpperHalf(d32, mul_10); + sum1 += mul1; + return sum0 + mul0; +} + +// ================================================== COMBINE + +// ------------------------------ Combine (InterleaveLower) + +// Full result +HWY_API Vec128 Combine(Full128 /* tag */, Vec64 hi, + Vec64 lo) { + return Vec128(vcombine_u8(lo.raw, hi.raw)); +} +HWY_API Vec128 Combine(Full128 /* tag */, + Vec64 hi, Vec64 lo) { + return Vec128(vcombine_u16(lo.raw, hi.raw)); +} +HWY_API Vec128 Combine(Full128 /* tag */, + Vec64 hi, Vec64 lo) { + return Vec128(vcombine_u32(lo.raw, hi.raw)); +} +HWY_API Vec128 Combine(Full128 /* tag */, + Vec64 hi, Vec64 lo) { + return Vec128(vcombine_u64(lo.raw, hi.raw)); +} + +HWY_API Vec128 Combine(Full128 /* tag */, Vec64 hi, + Vec64 lo) { + return Vec128(vcombine_s8(lo.raw, hi.raw)); +} +HWY_API Vec128 Combine(Full128 /* tag */, Vec64 hi, + Vec64 lo) { + return Vec128(vcombine_s16(lo.raw, hi.raw)); +} +HWY_API Vec128 Combine(Full128 /* tag */, Vec64 hi, + Vec64 lo) { + return Vec128(vcombine_s32(lo.raw, hi.raw)); +} +HWY_API Vec128 Combine(Full128 /* tag */, Vec64 hi, + Vec64 lo) { + return Vec128(vcombine_s64(lo.raw, hi.raw)); +} + +HWY_API Vec128 Combine(Full128 /* tag */, Vec64 hi, + Vec64 lo) { + return Vec128(vcombine_f32(lo.raw, hi.raw)); +} +#if HWY_ARCH_ARM_A64 +HWY_API Vec128 Combine(Full128 /* tag */, Vec64 hi, + Vec64 lo) { + return Vec128(vcombine_f64(lo.raw, hi.raw)); +} +#endif + +// < 64bit input, <= 64 bit result +template 
+HWY_API Vec128 Combine(Simd d, Vec128 hi, + Vec128 lo) { + // First double N (only lower halves will be used). + const Vec128 hi2(hi.raw); + const Vec128 lo2(lo.raw); + // Repartition to two unsigned lanes (each the size of the valid input). + const Simd, 2, 0> du; + return BitCast(d, InterleaveLower(BitCast(du, lo2), BitCast(du, hi2))); +} + +// ------------------------------ ZeroExtendVector (Combine) + +template +HWY_API Vec128 ZeroExtendVector(Simd d, Vec128 lo) { + return Combine(d, Zero(Half()), lo); +} + +// ------------------------------ ConcatLowerLower + +// 64 or 128-bit input: just interleave +template +HWY_API Vec128 ConcatLowerLower(const Simd d, Vec128 hi, + Vec128 lo) { + // Treat half-width input as a single lane and interleave them. + const Repartition, decltype(d)> du; + return BitCast(d, InterleaveLower(BitCast(du, lo), BitCast(du, hi))); +} + +namespace detail { +#if HWY_ARCH_ARM_A64 +HWY_NEON_DEF_FUNCTION_UIF81632(InterleaveEven, vtrn1, _, 2) +HWY_NEON_DEF_FUNCTION_UIF81632(InterleaveOdd, vtrn2, _, 2) +#else + +// vtrn returns a struct with even and odd result. +#define HWY_NEON_BUILD_TPL_HWY_TRN +#define HWY_NEON_BUILD_RET_HWY_TRN(type, size) type##x##size##x2_t +// Pass raw args so we can accept uint16x2 args, for which there is no +// corresponding uint16x2x2 return type. +#define HWY_NEON_BUILD_PARAM_HWY_TRN(TYPE, size) \ + Raw128::type a, Raw128::type b +#define HWY_NEON_BUILD_ARG_HWY_TRN a, b + +// Cannot use UINT8 etc. type macros because the x2_t tuples are only defined +// for full and half vectors. +HWY_NEON_DEF_FUNCTION(uint8, 16, InterleaveEvenOdd, vtrnq, _, u8, HWY_TRN) +HWY_NEON_DEF_FUNCTION(uint8, 8, InterleaveEvenOdd, vtrn, _, u8, HWY_TRN) +HWY_NEON_DEF_FUNCTION(uint16, 8, InterleaveEvenOdd, vtrnq, _, u16, HWY_TRN) +HWY_NEON_DEF_FUNCTION(uint16, 4, InterleaveEvenOdd, vtrn, _, u16, HWY_TRN) +HWY_NEON_DEF_FUNCTION(uint32, 4, InterleaveEvenOdd, vtrnq, _, u32, HWY_TRN) +HWY_NEON_DEF_FUNCTION(uint32, 2, InterleaveEvenOdd, vtrn, _, u32, HWY_TRN) +HWY_NEON_DEF_FUNCTION(int8, 16, InterleaveEvenOdd, vtrnq, _, s8, HWY_TRN) +HWY_NEON_DEF_FUNCTION(int8, 8, InterleaveEvenOdd, vtrn, _, s8, HWY_TRN) +HWY_NEON_DEF_FUNCTION(int16, 8, InterleaveEvenOdd, vtrnq, _, s16, HWY_TRN) +HWY_NEON_DEF_FUNCTION(int16, 4, InterleaveEvenOdd, vtrn, _, s16, HWY_TRN) +HWY_NEON_DEF_FUNCTION(int32, 4, InterleaveEvenOdd, vtrnq, _, s32, HWY_TRN) +HWY_NEON_DEF_FUNCTION(int32, 2, InterleaveEvenOdd, vtrn, _, s32, HWY_TRN) +HWY_NEON_DEF_FUNCTION(float32, 4, InterleaveEvenOdd, vtrnq, _, f32, HWY_TRN) +HWY_NEON_DEF_FUNCTION(float32, 2, InterleaveEvenOdd, vtrn, _, f32, HWY_TRN) +#endif +} // namespace detail + +// <= 32-bit input/output +template +HWY_API Vec128 ConcatLowerLower(const Simd d, Vec128 hi, + Vec128 lo) { + // Treat half-width input as two lanes and take every second one. + const Repartition, decltype(d)> du; +#if HWY_ARCH_ARM_A64 + return BitCast(d, detail::InterleaveEven(BitCast(du, lo), BitCast(du, hi))); +#else + using VU = VFromD; + return BitCast( + d, VU(detail::InterleaveEvenOdd(BitCast(du, lo).raw, BitCast(du, hi).raw) + .val[0])); +#endif +} + +// ------------------------------ ConcatUpperUpper + +// 64 or 128-bit input: just interleave +template +HWY_API Vec128 ConcatUpperUpper(const Simd d, Vec128 hi, + Vec128 lo) { + // Treat half-width input as a single lane and interleave them. 
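+  // e.g. for u32x4: lo = {l0,l1,l2,l3}, hi = {h0,h1,h2,h3} gives
+  // {l0,l1,h0,h1}: the two lower 64-bit halves become lanes 0 and 1.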
+ const Repartition, decltype(d)> du; + return BitCast(d, InterleaveUpper(du, BitCast(du, lo), BitCast(du, hi))); +} + +// <= 32-bit input/output +template +HWY_API Vec128 ConcatUpperUpper(const Simd d, Vec128 hi, + Vec128 lo) { + // Treat half-width input as two lanes and take every second one. + const Repartition, decltype(d)> du; +#if HWY_ARCH_ARM_A64 + return BitCast(d, detail::InterleaveOdd(BitCast(du, lo), BitCast(du, hi))); +#else + using VU = VFromD; + return BitCast( + d, VU(detail::InterleaveEvenOdd(BitCast(du, lo).raw, BitCast(du, hi).raw) + .val[1])); +#endif +} + +// ------------------------------ ConcatLowerUpper (ShiftLeftBytes) + +// 64 or 128-bit input: extract from concatenated +template +HWY_API Vec128 ConcatLowerUpper(const Simd d, Vec128 hi, + Vec128 lo) { + return CombineShiftRightBytes(d, hi, lo); +} + +// <= 32-bit input/output +template +HWY_API Vec128 ConcatLowerUpper(const Simd d, Vec128 hi, + Vec128 lo) { + constexpr size_t kSize = N * sizeof(T); + const Repartition d8; + const Full64 d8x8; + const Full64 d64; + using V8x8 = VFromD; + const V8x8 hi8x8(BitCast(d8, hi).raw); + // Move into most-significant bytes + const V8x8 lo8x8 = ShiftLeftBytes<8 - kSize>(V8x8(BitCast(d8, lo).raw)); + const V8x8 r = CombineShiftRightBytes<8 - kSize / 2>(d8x8, hi8x8, lo8x8); + // Back to original lane type, then shrink N. + return Vec128(BitCast(d64, r).raw); +} + +// ------------------------------ ConcatUpperLower + +// Works for all N. +template +HWY_API Vec128 ConcatUpperLower(Simd d, Vec128 hi, + Vec128 lo) { + return IfThenElse(FirstN(d, Lanes(d) / 2), lo, hi); +} + +// ------------------------------ ConcatOdd (InterleaveUpper) + +namespace detail { +// There is no vuzpq_u64. +HWY_NEON_DEF_FUNCTION_UIF81632(ConcatEven, vuzp1, _, 2) +HWY_NEON_DEF_FUNCTION_UIF81632(ConcatOdd, vuzp2, _, 2) +} // namespace detail + +// Full/half vector +template = 8>* = nullptr> +HWY_API Vec128 ConcatOdd(Simd /* tag */, Vec128 hi, + Vec128 lo) { + return detail::ConcatOdd(lo, hi); +} + +// 8-bit x4 +template +HWY_API Vec128 ConcatOdd(Simd d, Vec128 hi, + Vec128 lo) { + const Twice d2; + const Repartition dw2; + const VFromD hi2(hi.raw); + const VFromD lo2(lo.raw); + const VFromD Hx1Lx1 = BitCast(dw2, ConcatOdd(d2, hi2, lo2)); + // Compact into two pairs of u8, skipping the invalid x lanes. Could also use + // vcopy_lane_u16, but that's A64-only. + return Vec128(BitCast(d2, ConcatEven(dw2, Hx1Lx1, Hx1Lx1)).raw); +} + +// Any type x2 +template +HWY_API Vec128 ConcatOdd(Simd d, Vec128 hi, + Vec128 lo) { + return InterleaveUpper(d, lo, hi); +} + +// ------------------------------ ConcatEven (InterleaveLower) + +// Full/half vector +template = 8>* = nullptr> +HWY_API Vec128 ConcatEven(Simd /* tag */, Vec128 hi, + Vec128 lo) { + return detail::ConcatEven(lo, hi); +} + +// 8-bit x4 +template +HWY_API Vec128 ConcatEven(Simd d, Vec128 hi, + Vec128 lo) { + const Twice d2; + const Repartition dw2; + const VFromD hi2(hi.raw); + const VFromD lo2(lo.raw); + const VFromD Hx0Lx0 = BitCast(dw2, ConcatEven(d2, hi2, lo2)); + // Compact into two pairs of u8, skipping the invalid x lanes. Could also use + // vcopy_lane_u16, but that's A64-only. 
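+  // e.g. lo = {l0,l1,l2,l3}, hi = {h0,h1,h2,h3}: the first ConcatEven gave
+  // bytes {l0,l2,x,x,h0,h2,x,x}; as u16 lanes that is {L,x,H,x}, so taking
+  // even u16 lanes compacts to {L,H,..}, i.e. bytes {l0,l2,h0,h2}.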
+ return Vec128(BitCast(d2, ConcatEven(dw2, Hx0Lx0, Hx0Lx0)).raw); +} + +// Any type x2 +template +HWY_API Vec128 ConcatEven(Simd d, Vec128 hi, + Vec128 lo) { + return InterleaveLower(d, lo, hi); +} + +// ------------------------------ DupEven (InterleaveLower) + +template +HWY_API Vec128 DupEven(Vec128 v) { +#if HWY_ARCH_ARM_A64 + return detail::InterleaveEven(v, v); +#else + return Vec128(detail::InterleaveEvenOdd(v.raw, v.raw).val[0]); +#endif +} + +template +HWY_API Vec128 DupEven(const Vec128 v) { + return InterleaveLower(Simd(), v, v); +} + +// ------------------------------ DupOdd (InterleaveUpper) + +template +HWY_API Vec128 DupOdd(Vec128 v) { +#if HWY_ARCH_ARM_A64 + return detail::InterleaveOdd(v, v); +#else + return Vec128(detail::InterleaveEvenOdd(v.raw, v.raw).val[1]); +#endif +} + +template +HWY_API Vec128 DupOdd(const Vec128 v) { + return InterleaveUpper(Simd(), v, v); +} + +// ------------------------------ OddEven (IfThenElse) + +template +HWY_API Vec128 OddEven(const Vec128 a, const Vec128 b) { + const Simd d; + const Repartition d8; + alignas(16) constexpr uint8_t kBytes[16] = { + ((0 / sizeof(T)) & 1) ? 0 : 0xFF, ((1 / sizeof(T)) & 1) ? 0 : 0xFF, + ((2 / sizeof(T)) & 1) ? 0 : 0xFF, ((3 / sizeof(T)) & 1) ? 0 : 0xFF, + ((4 / sizeof(T)) & 1) ? 0 : 0xFF, ((5 / sizeof(T)) & 1) ? 0 : 0xFF, + ((6 / sizeof(T)) & 1) ? 0 : 0xFF, ((7 / sizeof(T)) & 1) ? 0 : 0xFF, + ((8 / sizeof(T)) & 1) ? 0 : 0xFF, ((9 / sizeof(T)) & 1) ? 0 : 0xFF, + ((10 / sizeof(T)) & 1) ? 0 : 0xFF, ((11 / sizeof(T)) & 1) ? 0 : 0xFF, + ((12 / sizeof(T)) & 1) ? 0 : 0xFF, ((13 / sizeof(T)) & 1) ? 0 : 0xFF, + ((14 / sizeof(T)) & 1) ? 0 : 0xFF, ((15 / sizeof(T)) & 1) ? 0 : 0xFF, + }; + const auto vec = BitCast(d, Load(d8, kBytes)); + return IfThenElse(MaskFromVec(vec), b, a); +} + +// ------------------------------ OddEvenBlocks +template +HWY_API Vec128 OddEvenBlocks(Vec128 /* odd */, Vec128 even) { + return even; +} + +// ------------------------------ SwapAdjacentBlocks + +template +HWY_API Vec128 SwapAdjacentBlocks(Vec128 v) { + return v; +} + +// ------------------------------ ReverseBlocks + +// Single block: no change +template +HWY_API Vec128 ReverseBlocks(Full128 /* tag */, const Vec128 v) { + return v; +} + +// ------------------------------ ReorderDemote2To (OddEven) + +template +HWY_API Vec128 ReorderDemote2To( + Simd dbf16, Vec128 a, Vec128 b) { + const RebindToUnsigned du16; + const Repartition du32; + const Vec128 b_in_even = ShiftRight<16>(BitCast(du32, b)); + return BitCast(dbf16, OddEven(BitCast(du16, a), BitCast(du16, b_in_even))); +} + +HWY_API Vec128 ReorderDemote2To(Full128 d16, + Vec128 a, Vec128 b) { + const Vec64 a16(vqmovn_s32(a.raw)); +#if HWY_ARCH_ARM_A64 + (void)d16; + return Vec128(vqmovn_high_s32(a16.raw, b.raw)); +#else + const Vec64 b16(vqmovn_s32(b.raw)); + return Combine(d16, a16, b16); +#endif +} + +HWY_API Vec64 ReorderDemote2To(Full64 /*d16*/, + Vec64 a, Vec64 b) { + const Full128 d32; + const Vec128 ab = Combine(d32, a, b); + return Vec64(vqmovn_s32(ab.raw)); +} + +HWY_API Vec32 ReorderDemote2To(Full32 /*d16*/, + Vec32 a, Vec32 b) { + const Full128 d32; + const Vec64 ab(vzip1_s32(a.raw, b.raw)); + return Vec32(vqmovn_s32(Combine(d32, ab, ab).raw)); +} + +// ================================================== CRYPTO + +#if defined(__ARM_FEATURE_AES) || \ + (HWY_HAVE_RUNTIME_DISPATCH && HWY_ARCH_ARM_A64) + +// Per-target flag to prevent generic_ops-inl.h from defining AESRound. 
+#ifdef HWY_NATIVE_AES +#undef HWY_NATIVE_AES +#else +#define HWY_NATIVE_AES +#endif + +HWY_API Vec128 AESRound(Vec128 state, + Vec128 round_key) { + // NOTE: it is important that AESE and AESMC be consecutive instructions so + // they can be fused. AESE includes AddRoundKey, which is a different ordering + // than the AES-NI semantics we adopted, so XOR by 0 and later with the actual + // round key (the compiler will hopefully optimize this for multiple rounds). + return Vec128(vaesmcq_u8(vaeseq_u8(state.raw, vdupq_n_u8(0)))) ^ + round_key; +} + +HWY_API Vec128 AESLastRound(Vec128 state, + Vec128 round_key) { + return Vec128(vaeseq_u8(state.raw, vdupq_n_u8(0))) ^ round_key; +} + +HWY_API Vec128 CLMulLower(Vec128 a, Vec128 b) { + return Vec128((uint64x2_t)vmull_p64(GetLane(a), GetLane(b))); +} + +HWY_API Vec128 CLMulUpper(Vec128 a, Vec128 b) { + return Vec128( + (uint64x2_t)vmull_high_p64((poly64x2_t)a.raw, (poly64x2_t)b.raw)); +} + +#endif // __ARM_FEATURE_AES + +// ================================================== MISC + +template +HWY_API Vec128 PromoteTo(Simd df32, + const Vec128 v) { + const Rebind du16; + const RebindToSigned di32; + return BitCast(df32, ShiftLeft<16>(PromoteTo(di32, BitCast(du16, v)))); +} + +// ------------------------------ Truncations + +template * = nullptr> +HWY_API Vec128 TruncateTo(Simd /* tag */, + const Vec128 v) { + const Repartition> d; + const auto v1 = BitCast(d, v); + return Vec128{v1.raw}; +} + +HWY_API Vec128 TruncateTo(Simd /* tag */, + const Vec128 v) { + const Repartition> d; + const auto v1 = BitCast(d, v); + const auto v2 = detail::ConcatEven(v1, v1); + const auto v3 = detail::ConcatEven(v2, v2); + const auto v4 = detail::ConcatEven(v3, v3); + return LowerHalf(LowerHalf(LowerHalf(v4))); +} + +HWY_API Vec32 TruncateTo(Simd /* tag */, + const Vec128 v) { + const Repartition> d; + const auto v1 = BitCast(d, v); + const auto v2 = detail::ConcatEven(v1, v1); + const auto v3 = detail::ConcatEven(v2, v2); + return LowerHalf(LowerHalf(v3)); +} + +HWY_API Vec64 TruncateTo(Simd /* tag */, + const Vec128 v) { + const Repartition> d; + const auto v1 = BitCast(d, v); + const auto v2 = detail::ConcatEven(v1, v1); + return LowerHalf(v2); +} + +template = 2>* = nullptr> +HWY_API Vec128 TruncateTo(Simd /* tag */, + const Vec128 v) { + const Repartition> d; + const auto v1 = BitCast(d, v); + const auto v2 = detail::ConcatEven(v1, v1); + const auto v3 = detail::ConcatEven(v2, v2); + return LowerHalf(LowerHalf(v3)); +} + +template = 2>* = nullptr> +HWY_API Vec128 TruncateTo(Simd /* tag */, + const Vec128 v) { + const Repartition> d; + const auto v1 = BitCast(d, v); + const auto v2 = detail::ConcatEven(v1, v1); + return LowerHalf(v2); +} + +template = 2>* = nullptr> +HWY_API Vec128 TruncateTo(Simd /* tag */, + const Vec128 v) { + const Repartition> d; + const auto v1 = BitCast(d, v); + const auto v2 = detail::ConcatEven(v1, v1); + return LowerHalf(v2); +} + +// ------------------------------ MulEven (ConcatEven) + +// Multiplies even lanes (0, 2 ..) and places the double-wide result into +// even and the upper half into its odd neighbor lane. 
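+// Minimal scalar model (MulEvenScalar is a hypothetical helper, not part of
+// this file): viewed as 32-bit lanes, result lanes (2k, 2k+1) together hold
+// the exact 64-bit product of input lanes 2k.
+static inline uint64_t MulEvenScalar(uint32_t a_even, uint32_t b_even) {
+  return static_cast<uint64_t>(a_even) * b_even;  // exact, no overflow
+}
+// The NEON versions follow: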
+HWY_API Vec128 MulEven(Vec128 a, Vec128 b) { + const Full128 d; + int32x4_t a_packed = ConcatEven(d, a, a).raw; + int32x4_t b_packed = ConcatEven(d, b, b).raw; + return Vec128( + vmull_s32(vget_low_s32(a_packed), vget_low_s32(b_packed))); +} +HWY_API Vec128 MulEven(Vec128 a, Vec128 b) { + const Full128 d; + uint32x4_t a_packed = ConcatEven(d, a, a).raw; + uint32x4_t b_packed = ConcatEven(d, b, b).raw; + return Vec128( + vmull_u32(vget_low_u32(a_packed), vget_low_u32(b_packed))); +} + +template +HWY_API Vec128 MulEven(const Vec128 a, + const Vec128 b) { + const DFromV d; + int32x2_t a_packed = ConcatEven(d, a, a).raw; + int32x2_t b_packed = ConcatEven(d, b, b).raw; + return Vec128( + vget_low_s64(vmull_s32(a_packed, b_packed))); +} +template +HWY_API Vec128 MulEven(const Vec128 a, + const Vec128 b) { + const DFromV d; + uint32x2_t a_packed = ConcatEven(d, a, a).raw; + uint32x2_t b_packed = ConcatEven(d, b, b).raw; + return Vec128( + vget_low_u64(vmull_u32(a_packed, b_packed))); +} + +HWY_INLINE Vec128 MulEven(Vec128 a, Vec128 b) { + uint64_t hi; + uint64_t lo = Mul128(vgetq_lane_u64(a.raw, 0), vgetq_lane_u64(b.raw, 0), &hi); + return Vec128(vsetq_lane_u64(hi, vdupq_n_u64(lo), 1)); +} + +HWY_INLINE Vec128 MulOdd(Vec128 a, Vec128 b) { + uint64_t hi; + uint64_t lo = Mul128(vgetq_lane_u64(a.raw, 1), vgetq_lane_u64(b.raw, 1), &hi); + return Vec128(vsetq_lane_u64(hi, vdupq_n_u64(lo), 1)); +} + +// ------------------------------ TableLookupBytes (Combine, LowerHalf) + +// Both full +template +HWY_API Vec128 TableLookupBytes(const Vec128 bytes, + const Vec128 from) { + const Full128 d; + const Repartition d8; +#if HWY_ARCH_ARM_A64 + return BitCast(d, Vec128(vqtbl1q_u8(BitCast(d8, bytes).raw, + BitCast(d8, from).raw))); +#else + uint8x16_t table0 = BitCast(d8, bytes).raw; + uint8x8x2_t table; + table.val[0] = vget_low_u8(table0); + table.val[1] = vget_high_u8(table0); + uint8x16_t idx = BitCast(d8, from).raw; + uint8x8_t low = vtbl2_u8(table, vget_low_u8(idx)); + uint8x8_t hi = vtbl2_u8(table, vget_high_u8(idx)); + return BitCast(d, Vec128(vcombine_u8(low, hi))); +#endif +} + +// Partial index vector +template +HWY_API Vec128 TableLookupBytes(const Vec128 bytes, + const Vec128 from) { + const Full128 d_full; + const Vec64 from64(from.raw); + const auto idx_full = Combine(d_full, from64, from64); + const auto out_full = TableLookupBytes(bytes, idx_full); + return Vec128(LowerHalf(Half(), out_full).raw); +} + +// Partial table vector +template +HWY_API Vec128 TableLookupBytes(const Vec128 bytes, + const Vec128 from) { + const Full128 d_full; + return TableLookupBytes(Combine(d_full, bytes, bytes), from); +} + +// Partial both +template +HWY_API VFromD>> TableLookupBytes( + Vec128 bytes, Vec128 from) { + const Simd d; + const Simd d_idx; + const Repartition d_idx8; + // uint8x8 + const auto bytes8 = BitCast(Repartition(), bytes); + const auto from8 = BitCast(d_idx8, from); + const VFromD v8(vtbl1_u8(bytes8.raw, from8.raw)); + return BitCast(d_idx, v8); +} + +// For all vector widths; ARM anyway zeroes if >= 0x10. 
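+// A scalar model of that TBL behavior (TblOr0Scalar is a hypothetical
+// helper, not part of this file):
+static inline uint8_t TblOr0Scalar(const uint8_t table[16], uint8_t idx) {
+  return (idx < 0x10) ? table[idx] : 0;  // out-of-range indices read as 0
+}
+// Hence the unchecked forwarding below: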
+template +HWY_API VI TableLookupBytesOr0(const V bytes, const VI from) { + return TableLookupBytes(bytes, from); +} + +// ------------------------------ Scatter (Store) + +template +HWY_API void ScatterOffset(Vec128 v, Simd d, + T* HWY_RESTRICT base, + const Vec128 offset) { + static_assert(sizeof(T) == sizeof(Offset), "Must match for portability"); + + alignas(16) T lanes[N]; + Store(v, d, lanes); + + alignas(16) Offset offset_lanes[N]; + Store(offset, Rebind(), offset_lanes); + + uint8_t* base_bytes = reinterpret_cast(base); + for (size_t i = 0; i < N; ++i) { + CopyBytes(&lanes[i], base_bytes + offset_lanes[i]); + } +} + +template +HWY_API void ScatterIndex(Vec128 v, Simd d, T* HWY_RESTRICT base, + const Vec128 index) { + static_assert(sizeof(T) == sizeof(Index), "Must match for portability"); + + alignas(16) T lanes[N]; + Store(v, d, lanes); + + alignas(16) Index index_lanes[N]; + Store(index, Rebind(), index_lanes); + + for (size_t i = 0; i < N; ++i) { + base[index_lanes[i]] = lanes[i]; + } +} + +// ------------------------------ Gather (Load/Store) + +template +HWY_API Vec128 GatherOffset(const Simd d, + const T* HWY_RESTRICT base, + const Vec128 offset) { + static_assert(sizeof(T) == sizeof(Offset), "Must match for portability"); + + alignas(16) Offset offset_lanes[N]; + Store(offset, Rebind(), offset_lanes); + + alignas(16) T lanes[N]; + const uint8_t* base_bytes = reinterpret_cast(base); + for (size_t i = 0; i < N; ++i) { + CopyBytes(base_bytes + offset_lanes[i], &lanes[i]); + } + return Load(d, lanes); +} + +template +HWY_API Vec128 GatherIndex(const Simd d, + const T* HWY_RESTRICT base, + const Vec128 index) { + static_assert(sizeof(T) == sizeof(Index), "Must match for portability"); + + alignas(16) Index index_lanes[N]; + Store(index, Rebind(), index_lanes); + + alignas(16) T lanes[N]; + for (size_t i = 0; i < N; ++i) { + lanes[i] = base[index_lanes[i]]; + } + return Load(d, lanes); +} + +// ------------------------------ Reductions + +namespace detail { + +// N=1 for any T: no-op +template +HWY_INLINE Vec128 SumOfLanes(hwy::SizeTag /* tag */, + const Vec128 v) { + return v; +} +template +HWY_INLINE Vec128 MinOfLanes(hwy::SizeTag /* tag */, + const Vec128 v) { + return v; +} +template +HWY_INLINE Vec128 MaxOfLanes(hwy::SizeTag /* tag */, + const Vec128 v) { + return v; +} + +// u32/i32/f32: N=2 +template +HWY_INLINE Vec128 SumOfLanes(hwy::SizeTag<4> /* tag */, + const Vec128 v10) { + return v10 + Shuffle2301(v10); +} +template +HWY_INLINE Vec128 MinOfLanes(hwy::SizeTag<4> /* tag */, + const Vec128 v10) { + return Min(v10, Shuffle2301(v10)); +} +template +HWY_INLINE Vec128 MaxOfLanes(hwy::SizeTag<4> /* tag */, + const Vec128 v10) { + return Max(v10, Shuffle2301(v10)); +} + +// full vectors +#if HWY_ARCH_ARM_A64 +HWY_INLINE Vec128 SumOfLanes(hwy::SizeTag<4> /* tag */, + const Vec128 v) { + return Vec128(vdupq_n_u32(vaddvq_u32(v.raw))); +} +HWY_INLINE Vec128 SumOfLanes(hwy::SizeTag<4> /* tag */, + const Vec128 v) { + return Vec128(vdupq_n_s32(vaddvq_s32(v.raw))); +} +HWY_INLINE Vec128 SumOfLanes(hwy::SizeTag<4> /* tag */, + const Vec128 v) { + return Vec128(vdupq_n_f32(vaddvq_f32(v.raw))); +} +HWY_INLINE Vec128 SumOfLanes(hwy::SizeTag<8> /* tag */, + const Vec128 v) { + return Vec128(vdupq_n_u64(vaddvq_u64(v.raw))); +} +HWY_INLINE Vec128 SumOfLanes(hwy::SizeTag<8> /* tag */, + const Vec128 v) { + return Vec128(vdupq_n_s64(vaddvq_s64(v.raw))); +} +HWY_INLINE Vec128 SumOfLanes(hwy::SizeTag<8> /* tag */, + const Vec128 v) { + return Vec128(vdupq_n_f64(vaddvq_f64(v.raw))); +} +#else 
+// ARMv7 version for everything except doubles. +HWY_INLINE Vec128 SumOfLanes(hwy::SizeTag<4> /* tag */, + const Vec128 v) { + uint32x4x2_t v0 = vuzpq_u32(v.raw, v.raw); + uint32x4_t c0 = vaddq_u32(v0.val[0], v0.val[1]); + uint32x4x2_t v1 = vuzpq_u32(c0, c0); + return Vec128(vaddq_u32(v1.val[0], v1.val[1])); +} +HWY_INLINE Vec128 SumOfLanes(hwy::SizeTag<4> /* tag */, + const Vec128 v) { + int32x4x2_t v0 = vuzpq_s32(v.raw, v.raw); + int32x4_t c0 = vaddq_s32(v0.val[0], v0.val[1]); + int32x4x2_t v1 = vuzpq_s32(c0, c0); + return Vec128(vaddq_s32(v1.val[0], v1.val[1])); +} +HWY_INLINE Vec128 SumOfLanes(hwy::SizeTag<4> /* tag */, + const Vec128 v) { + float32x4x2_t v0 = vuzpq_f32(v.raw, v.raw); + float32x4_t c0 = vaddq_f32(v0.val[0], v0.val[1]); + float32x4x2_t v1 = vuzpq_f32(c0, c0); + return Vec128(vaddq_f32(v1.val[0], v1.val[1])); +} +HWY_INLINE Vec128 SumOfLanes(hwy::SizeTag<8> /* tag */, + const Vec128 v) { + return v + Shuffle01(v); +} +HWY_INLINE Vec128 SumOfLanes(hwy::SizeTag<8> /* tag */, + const Vec128 v) { + return v + Shuffle01(v); +} +#endif + +template +HWY_INLINE Vec128 MinOfLanes(hwy::SizeTag<4> /* tag */, + const Vec128 v3210) { + const Vec128 v1032 = Shuffle1032(v3210); + const Vec128 v31_20_31_20 = Min(v3210, v1032); + const Vec128 v20_31_20_31 = Shuffle0321(v31_20_31_20); + return Min(v20_31_20_31, v31_20_31_20); +} +template +HWY_INLINE Vec128 MaxOfLanes(hwy::SizeTag<4> /* tag */, + const Vec128 v3210) { + const Vec128 v1032 = Shuffle1032(v3210); + const Vec128 v31_20_31_20 = Max(v3210, v1032); + const Vec128 v20_31_20_31 = Shuffle0321(v31_20_31_20); + return Max(v20_31_20_31, v31_20_31_20); +} + +// For u64/i64[/f64]. +template +HWY_INLINE Vec128 MinOfLanes(hwy::SizeTag<8> /* tag */, + const Vec128 v10) { + const Vec128 v01 = Shuffle01(v10); + return Min(v10, v01); +} +template +HWY_INLINE Vec128 MaxOfLanes(hwy::SizeTag<8> /* tag */, + const Vec128 v10) { + const Vec128 v01 = Shuffle01(v10); + return Max(v10, v01); +} + +template +HWY_API Vec128 SumOfLanes(hwy::SizeTag<2> /* tag */, + Vec128 v) { + const Simd d; + const RepartitionToWide d32; + const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF)); + const auto odd = ShiftRight<16>(BitCast(d32, v)); + const auto sum = SumOfLanes(hwy::SizeTag<4>(), even + odd); + // Also broadcast into odd lanes. + return OddEven(BitCast(d, ShiftLeft<16>(sum)), BitCast(d, sum)); +} +template +HWY_API Vec128 SumOfLanes(hwy::SizeTag<2> /* tag */, + Vec128 v) { + const Simd d; + const RepartitionToWide d32; + // Sign-extend + const auto even = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, v))); + const auto odd = ShiftRight<16>(BitCast(d32, v)); + const auto sum = SumOfLanes(hwy::SizeTag<4>(), even + odd); + // Also broadcast into odd lanes. + return OddEven(BitCast(d, ShiftLeft<16>(sum)), BitCast(d, sum)); +} + +template +HWY_API Vec128 MinOfLanes(hwy::SizeTag<2> /* tag */, + Vec128 v) { + const Simd d; + const RepartitionToWide d32; + const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF)); + const auto odd = ShiftRight<16>(BitCast(d32, v)); + const auto min = MinOfLanes(hwy::SizeTag<4>(), Min(even, odd)); + // Also broadcast into odd lanes. 
+ return OddEven(BitCast(d, ShiftLeft<16>(min)), BitCast(d, min)); +} +template +HWY_API Vec128 MinOfLanes(hwy::SizeTag<2> /* tag */, + Vec128 v) { + const Simd d; + const RepartitionToWide d32; + // Sign-extend + const auto even = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, v))); + const auto odd = ShiftRight<16>(BitCast(d32, v)); + const auto min = MinOfLanes(hwy::SizeTag<4>(), Min(even, odd)); + // Also broadcast into odd lanes. + return OddEven(BitCast(d, ShiftLeft<16>(min)), BitCast(d, min)); +} + +template +HWY_API Vec128 MaxOfLanes(hwy::SizeTag<2> /* tag */, + Vec128 v) { + const Simd d; + const RepartitionToWide d32; + const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF)); + const auto odd = ShiftRight<16>(BitCast(d32, v)); + const auto min = MaxOfLanes(hwy::SizeTag<4>(), Max(even, odd)); + // Also broadcast into odd lanes. + return OddEven(BitCast(d, ShiftLeft<16>(min)), BitCast(d, min)); +} +template +HWY_API Vec128 MaxOfLanes(hwy::SizeTag<2> /* tag */, + Vec128 v) { + const Simd d; + const RepartitionToWide d32; + // Sign-extend + const auto even = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, v))); + const auto odd = ShiftRight<16>(BitCast(d32, v)); + const auto min = MaxOfLanes(hwy::SizeTag<4>(), Max(even, odd)); + // Also broadcast into odd lanes. + return OddEven(BitCast(d, ShiftLeft<16>(min)), BitCast(d, min)); +} + +} // namespace detail + +template +HWY_API Vec128 SumOfLanes(Simd /* tag */, const Vec128 v) { + return detail::SumOfLanes(hwy::SizeTag(), v); +} +template +HWY_API Vec128 MinOfLanes(Simd /* tag */, const Vec128 v) { + return detail::MinOfLanes(hwy::SizeTag(), v); +} +template +HWY_API Vec128 MaxOfLanes(Simd /* tag */, const Vec128 v) { + return detail::MaxOfLanes(hwy::SizeTag(), v); +} + +// ------------------------------ LoadMaskBits (TestBit) + +namespace detail { + +// Helper function to set 64 bits and potentially return a smaller vector. The +// overload is required to call the q vs non-q intrinsics. Note that 8-bit +// LoadMaskBits only requires 16 bits, but 64 avoids casting. +template +HWY_INLINE Vec128 Set64(Simd /* tag */, uint64_t mask_bits) { + const auto v64 = Vec64(vdup_n_u64(mask_bits)); + return Vec128(BitCast(Full64(), v64).raw); +} +template +HWY_INLINE Vec128 Set64(Full128 d, uint64_t mask_bits) { + return BitCast(d, Vec128(vdupq_n_u64(mask_bits))); +} + +template +HWY_INLINE Mask128 LoadMaskBits(Simd d, uint64_t mask_bits) { + const RebindToUnsigned du; + // Easier than Set(), which would require an >8-bit type, which would not + // compile for T=uint8_t, N=1. + const auto vmask_bits = Set64(du, mask_bits); + + // Replicate bytes 8x such that each byte contains the bit that governs it. 
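+  // e.g. mask_bits = 0x0104: byte 0 (0x04) is copied into lanes 0..7 and
+  // byte 1 (0x01) into lanes 8..15; TestBit against {1,2,4,...} below then
+  // sets exactly lanes 2 and 8.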
+ alignas(16) constexpr uint8_t kRep8[16] = {0, 0, 0, 0, 0, 0, 0, 0, + 1, 1, 1, 1, 1, 1, 1, 1}; + const auto rep8 = TableLookupBytes(vmask_bits, Load(du, kRep8)); + + alignas(16) constexpr uint8_t kBit[16] = {1, 2, 4, 8, 16, 32, 64, 128, + 1, 2, 4, 8, 16, 32, 64, 128}; + return RebindMask(d, TestBit(rep8, LoadDup128(du, kBit))); +} + +template +HWY_INLINE Mask128 LoadMaskBits(Simd d, uint64_t mask_bits) { + const RebindToUnsigned du; + alignas(16) constexpr uint16_t kBit[8] = {1, 2, 4, 8, 16, 32, 64, 128}; + const auto vmask_bits = Set(du, static_cast(mask_bits)); + return RebindMask(d, TestBit(vmask_bits, Load(du, kBit))); +} + +template +HWY_INLINE Mask128 LoadMaskBits(Simd d, uint64_t mask_bits) { + const RebindToUnsigned du; + alignas(16) constexpr uint32_t kBit[8] = {1, 2, 4, 8}; + const auto vmask_bits = Set(du, static_cast(mask_bits)); + return RebindMask(d, TestBit(vmask_bits, Load(du, kBit))); +} + +template +HWY_INLINE Mask128 LoadMaskBits(Simd d, uint64_t mask_bits) { + const RebindToUnsigned du; + alignas(16) constexpr uint64_t kBit[8] = {1, 2}; + return RebindMask(d, TestBit(Set(du, mask_bits), Load(du, kBit))); +} + +} // namespace detail + +// `p` points to at least 8 readable bytes, not all of which need be valid. +template +HWY_API Mask128 LoadMaskBits(Simd d, + const uint8_t* HWY_RESTRICT bits) { + uint64_t mask_bits = 0; + CopyBytes<(N + 7) / 8>(bits, &mask_bits); + return detail::LoadMaskBits(d, mask_bits); +} + +// ------------------------------ Mask + +namespace detail { + +// Returns mask[i]? 0xF : 0 in each nibble. This is more efficient than +// BitsFromMask for use in (partial) CountTrue, FindFirstTrue and AllFalse. +template +HWY_INLINE uint64_t NibblesFromMask(const Full128 d, Mask128 mask) { + const Full128 du16; + const Vec128 vu16 = BitCast(du16, VecFromMask(d, mask)); + const Vec64 nib(vshrn_n_u16(vu16.raw, 4)); + return GetLane(BitCast(Full64(), nib)); +} + +template +HWY_INLINE uint64_t NibblesFromMask(const Full64 d, Mask64 mask) { + // There is no vshrn_n_u16 for uint16x4, so zero-extend. + const Twice d2; + const Vec128 v128 = ZeroExtendVector(d2, VecFromMask(d, mask)); + // No need to mask, upper half is zero thanks to ZeroExtendVector. + return NibblesFromMask(d2, MaskFromVec(v128)); +} + +template +HWY_INLINE uint64_t NibblesFromMask(Simd /*d*/, Mask128 mask) { + const Mask64 mask64(mask.raw); + const uint64_t nib = NibblesFromMask(Full64(), mask64); + // Clear nibbles from upper half of 64-bits + constexpr size_t kBytes = sizeof(T) * N; + return nib & ((1ull << (kBytes * 4)) - 1); +} + +template +HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/, + const Mask128 mask) { + alignas(16) constexpr uint8_t kSliceLanes[16] = { + 1, 2, 4, 8, 0x10, 0x20, 0x40, 0x80, 1, 2, 4, 8, 0x10, 0x20, 0x40, 0x80, + }; + const Full128 du; + const Vec128 values = + BitCast(du, VecFromMask(Full128(), mask)) & Load(du, kSliceLanes); + +#if HWY_ARCH_ARM_A64 + // Can't vaddv - we need two separate bytes (16 bits). + const uint8x8_t x2 = vget_low_u8(vpaddq_u8(values.raw, values.raw)); + const uint8x8_t x4 = vpadd_u8(x2, x2); + const uint8x8_t x8 = vpadd_u8(x4, x4); + return vget_lane_u64(vreinterpret_u64_u8(x8), 0); +#else + // Don't have vpaddq, so keep doubling lane size. 
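+  // vpaddl widens and sums adjacent pairs: 16x u8 -> 8x u16 -> 4x u32 ->
+  // 2x u64. Each u64 lane then holds the sum of one 8-byte half's distinct
+  // bit values, i.e. one byte of the mask, hence the shift by 8 below.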
+ const uint16x8_t x2 = vpaddlq_u8(values.raw); + const uint32x4_t x4 = vpaddlq_u16(x2); + const uint64x2_t x8 = vpaddlq_u32(x4); + return (vgetq_lane_u64(x8, 1) << 8) | vgetq_lane_u64(x8, 0); +#endif +} + +template +HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/, + const Mask128 mask) { + // Upper lanes of partial loads are undefined. OnlyActive will fix this if + // we load all kSliceLanes so the upper lanes do not pollute the valid bits. + alignas(8) constexpr uint8_t kSliceLanes[8] = {1, 2, 4, 8, + 0x10, 0x20, 0x40, 0x80}; + const Simd d; + const RebindToUnsigned du; + const Vec128 slice(Load(Full64(), kSliceLanes).raw); + const Vec128 values = BitCast(du, VecFromMask(d, mask)) & slice; + +#if HWY_ARCH_ARM_A64 + return vaddv_u8(values.raw); +#else + const uint16x4_t x2 = vpaddl_u8(values.raw); + const uint32x2_t x4 = vpaddl_u16(x2); + const uint64x1_t x8 = vpaddl_u32(x4); + return vget_lane_u64(x8, 0); +#endif +} + +template +HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<2> /*tag*/, + const Mask128 mask) { + alignas(16) constexpr uint16_t kSliceLanes[8] = {1, 2, 4, 8, + 0x10, 0x20, 0x40, 0x80}; + const Full128 d; + const Full128 du; + const Vec128 values = + BitCast(du, VecFromMask(d, mask)) & Load(du, kSliceLanes); +#if HWY_ARCH_ARM_A64 + return vaddvq_u16(values.raw); +#else + const uint32x4_t x2 = vpaddlq_u16(values.raw); + const uint64x2_t x4 = vpaddlq_u32(x2); + return vgetq_lane_u64(x4, 0) + vgetq_lane_u64(x4, 1); +#endif +} + +template +HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<2> /*tag*/, + const Mask128 mask) { + // Upper lanes of partial loads are undefined. OnlyActive will fix this if + // we load all kSliceLanes so the upper lanes do not pollute the valid bits. + alignas(8) constexpr uint16_t kSliceLanes[4] = {1, 2, 4, 8}; + const Simd d; + const RebindToUnsigned du; + const Vec128 slice(Load(Full64(), kSliceLanes).raw); + const Vec128 values = BitCast(du, VecFromMask(d, mask)) & slice; +#if HWY_ARCH_ARM_A64 + return vaddv_u16(values.raw); +#else + const uint32x2_t x2 = vpaddl_u16(values.raw); + const uint64x1_t x4 = vpaddl_u32(x2); + return vget_lane_u64(x4, 0); +#endif +} + +template +HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<4> /*tag*/, + const Mask128 mask) { + alignas(16) constexpr uint32_t kSliceLanes[4] = {1, 2, 4, 8}; + const Full128 d; + const Full128 du; + const Vec128 values = + BitCast(du, VecFromMask(d, mask)) & Load(du, kSliceLanes); +#if HWY_ARCH_ARM_A64 + return vaddvq_u32(values.raw); +#else + const uint64x2_t x2 = vpaddlq_u32(values.raw); + return vgetq_lane_u64(x2, 0) + vgetq_lane_u64(x2, 1); +#endif +} + +template +HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<4> /*tag*/, + const Mask128 mask) { + // Upper lanes of partial loads are undefined. OnlyActive will fix this if + // we load all kSliceLanes so the upper lanes do not pollute the valid bits. 
+ alignas(8) constexpr uint32_t kSliceLanes[2] = {1, 2}; + const Simd d; + const RebindToUnsigned du; + const Vec128 slice(Load(Full64(), kSliceLanes).raw); + const Vec128 values = BitCast(du, VecFromMask(d, mask)) & slice; +#if HWY_ARCH_ARM_A64 + return vaddv_u32(values.raw); +#else + const uint64x1_t x2 = vpaddl_u32(values.raw); + return vget_lane_u64(x2, 0); +#endif +} + +template +HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<8> /*tag*/, const Mask128 m) { + alignas(16) constexpr uint64_t kSliceLanes[2] = {1, 2}; + const Full128 d; + const Full128 du; + const Vec128 values = + BitCast(du, VecFromMask(d, m)) & Load(du, kSliceLanes); +#if HWY_ARCH_ARM_A64 + return vaddvq_u64(values.raw); +#else + return vgetq_lane_u64(values.raw, 0) + vgetq_lane_u64(values.raw, 1); +#endif +} + +template +HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<8> /*tag*/, + const Mask128 m) { + const Full64 d; + const Full64 du; + const Vec64 values = BitCast(du, VecFromMask(d, m)) & Set(du, 1); + return vget_lane_u64(values.raw, 0); +} + +// Returns the lowest N for the BitsFromMask result. +template +constexpr uint64_t OnlyActive(uint64_t bits) { + return ((N * sizeof(T)) >= 8) ? bits : (bits & ((1ull << N) - 1)); +} + +template +HWY_INLINE uint64_t BitsFromMask(const Mask128 mask) { + return OnlyActive(BitsFromMask(hwy::SizeTag(), mask)); +} + +// Returns number of lanes whose mask is set. +// +// Masks are either FF..FF or 0. Unfortunately there is no reduce-sub op +// ("vsubv"). ANDing with 1 would work but requires a constant. Negating also +// changes each lane to 1 (if mask set) or 0. +// NOTE: PopCount also operates on vectors, so we still have to do horizontal +// sums separately. We specialize CountTrue for full vectors (negating instead +// of PopCount because it avoids an extra shift), and use PopCount of +// NibblesFromMask for partial vectors. 
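+// Scalar model of the negation trick (CountTrueScalar is a hypothetical
+// helper, not part of this file): a set mask lane is all-ones, i.e. -1 as
+// a signed integer, so negating yields 1 per set lane and a horizontal sum
+// counts them.
+static inline size_t CountTrueScalar(const int8_t mask_lanes[16]) {
+  size_t count = 0;
+  for (int i = 0; i < 16; ++i) count += static_cast<size_t>(-mask_lanes[i]);
+  return count;
+}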
+ +template +HWY_INLINE size_t CountTrue(hwy::SizeTag<1> /*tag*/, const Mask128 mask) { + const Full128 di; + const int8x16_t ones = + vnegq_s8(BitCast(di, VecFromMask(Full128(), mask)).raw); + +#if HWY_ARCH_ARM_A64 + return static_cast(vaddvq_s8(ones)); +#else + const int16x8_t x2 = vpaddlq_s8(ones); + const int32x4_t x4 = vpaddlq_s16(x2); + const int64x2_t x8 = vpaddlq_s32(x4); + return static_cast(vgetq_lane_s64(x8, 0) + vgetq_lane_s64(x8, 1)); +#endif +} +template +HWY_INLINE size_t CountTrue(hwy::SizeTag<2> /*tag*/, const Mask128 mask) { + const Full128 di; + const int16x8_t ones = + vnegq_s16(BitCast(di, VecFromMask(Full128(), mask)).raw); + +#if HWY_ARCH_ARM_A64 + return static_cast(vaddvq_s16(ones)); +#else + const int32x4_t x2 = vpaddlq_s16(ones); + const int64x2_t x4 = vpaddlq_s32(x2); + return static_cast(vgetq_lane_s64(x4, 0) + vgetq_lane_s64(x4, 1)); +#endif +} + +template +HWY_INLINE size_t CountTrue(hwy::SizeTag<4> /*tag*/, const Mask128 mask) { + const Full128 di; + const int32x4_t ones = + vnegq_s32(BitCast(di, VecFromMask(Full128(), mask)).raw); + +#if HWY_ARCH_ARM_A64 + return static_cast(vaddvq_s32(ones)); +#else + const int64x2_t x2 = vpaddlq_s32(ones); + return static_cast(vgetq_lane_s64(x2, 0) + vgetq_lane_s64(x2, 1)); +#endif +} + +template +HWY_INLINE size_t CountTrue(hwy::SizeTag<8> /*tag*/, const Mask128 mask) { +#if HWY_ARCH_ARM_A64 + const Full128 di; + const int64x2_t ones = + vnegq_s64(BitCast(di, VecFromMask(Full128(), mask)).raw); + return static_cast(vaddvq_s64(ones)); +#else + const Full128 du; + const auto mask_u = VecFromMask(du, RebindMask(du, mask)); + const uint64x2_t ones = vshrq_n_u64(mask_u.raw, 63); + return static_cast(vgetq_lane_u64(ones, 0) + vgetq_lane_u64(ones, 1)); +#endif +} + +} // namespace detail + +// Full +template +HWY_API size_t CountTrue(Full128 /* tag */, const Mask128 mask) { + return detail::CountTrue(hwy::SizeTag(), mask); +} + +// Partial +template +HWY_API size_t CountTrue(Simd d, const Mask128 mask) { + constexpr int kDiv = 4 * sizeof(T); + return PopCount(detail::NibblesFromMask(d, mask)) / kDiv; +} + +template +HWY_API size_t FindKnownFirstTrue(const Simd d, + const Mask128 mask) { + const uint64_t nib = detail::NibblesFromMask(d, mask); + constexpr size_t kDiv = 4 * sizeof(T); + return Num0BitsBelowLS1Bit_Nonzero64(nib) / kDiv; +} + +template +HWY_API intptr_t FindFirstTrue(const Simd d, + const Mask128 mask) { + const uint64_t nib = detail::NibblesFromMask(d, mask); + if (nib == 0) return -1; + constexpr int kDiv = 4 * sizeof(T); + return static_cast(Num0BitsBelowLS1Bit_Nonzero64(nib) / kDiv); +} + +// `p` points to at least 8 writable bytes. 
+template +HWY_API size_t StoreMaskBits(Simd /* tag */, const Mask128 mask, + uint8_t* bits) { + const uint64_t mask_bits = detail::BitsFromMask(mask); + const size_t kNumBytes = (N + 7) / 8; + CopyBytes(&mask_bits, bits); + return kNumBytes; +} + +template +HWY_API bool AllFalse(const Simd d, const Mask128 m) { + return detail::NibblesFromMask(d, m) == 0; +} + +// Full +template +HWY_API bool AllTrue(const Full128 d, const Mask128 m) { + return detail::NibblesFromMask(d, m) == ~0ull; +} +// Partial +template +HWY_API bool AllTrue(const Simd d, const Mask128 m) { + constexpr size_t kBytes = sizeof(T) * N; + return detail::NibblesFromMask(d, m) == (1ull << (kBytes * 4)) - 1; +} + +// ------------------------------ Compress + +template +struct CompressIsPartition { + enum { value = 1 }; +}; + +namespace detail { + +// Load 8 bytes, replicate into upper half so ZipLower can use the lower half. +HWY_INLINE Vec128 Load8Bytes(Full128 /*d*/, + const uint8_t* bytes) { + return Vec128(vreinterpretq_u8_u64( + vld1q_dup_u64(reinterpret_cast(bytes)))); +} + +// Load 8 bytes and return half-reg with N <= 8 bytes. +template +HWY_INLINE Vec128 Load8Bytes(Simd d, + const uint8_t* bytes) { + return Load(d, bytes); +} + +template +HWY_INLINE Vec128 IdxFromBits(hwy::SizeTag<2> /*tag*/, + const uint64_t mask_bits) { + HWY_DASSERT(mask_bits < 256); + const Simd d; + const Repartition d8; + const Simd du; + + // ARM does not provide an equivalent of AVX2 permutevar, so we need byte + // indices for VTBL (one vector's worth for each of 256 combinations of + // 8 mask bits). Loading them directly would require 4 KiB. We can instead + // store lane indices and convert to byte indices (2*lane + 0..1), with the + // doubling baked into the table. AVX2 Compress32 stores eight 4-bit lane + // indices (total 1 KiB), broadcasts them into each 32-bit lane and shifts. + // Here, 16-bit lanes are too narrow to hold all bits, and unpacking nibbles + // is likely more costly than the higher cache footprint from storing bytes. 
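+  // Concretely: 256 entries x 8 bytes = 2 KiB. E.g. mask_bits = 0b1010
+  // (lanes 1 and 3 set) selects the row {2, 6, 0, 4, 8, 10, 12, 14}: the
+  // doubled indices of the selected lanes, followed by those of the rest.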
+ alignas(16) constexpr uint8_t table[256 * 8] = { + // PrintCompress16x8Tables + 0, 2, 4, 6, 8, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // + 2, 0, 4, 6, 8, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // + 4, 0, 2, 6, 8, 10, 12, 14, /**/ 0, 4, 2, 6, 8, 10, 12, 14, // + 2, 4, 0, 6, 8, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // + 6, 0, 2, 4, 8, 10, 12, 14, /**/ 0, 6, 2, 4, 8, 10, 12, 14, // + 2, 6, 0, 4, 8, 10, 12, 14, /**/ 0, 2, 6, 4, 8, 10, 12, 14, // + 4, 6, 0, 2, 8, 10, 12, 14, /**/ 0, 4, 6, 2, 8, 10, 12, 14, // + 2, 4, 6, 0, 8, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // + 8, 0, 2, 4, 6, 10, 12, 14, /**/ 0, 8, 2, 4, 6, 10, 12, 14, // + 2, 8, 0, 4, 6, 10, 12, 14, /**/ 0, 2, 8, 4, 6, 10, 12, 14, // + 4, 8, 0, 2, 6, 10, 12, 14, /**/ 0, 4, 8, 2, 6, 10, 12, 14, // + 2, 4, 8, 0, 6, 10, 12, 14, /**/ 0, 2, 4, 8, 6, 10, 12, 14, // + 6, 8, 0, 2, 4, 10, 12, 14, /**/ 0, 6, 8, 2, 4, 10, 12, 14, // + 2, 6, 8, 0, 4, 10, 12, 14, /**/ 0, 2, 6, 8, 4, 10, 12, 14, // + 4, 6, 8, 0, 2, 10, 12, 14, /**/ 0, 4, 6, 8, 2, 10, 12, 14, // + 2, 4, 6, 8, 0, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // + 10, 0, 2, 4, 6, 8, 12, 14, /**/ 0, 10, 2, 4, 6, 8, 12, 14, // + 2, 10, 0, 4, 6, 8, 12, 14, /**/ 0, 2, 10, 4, 6, 8, 12, 14, // + 4, 10, 0, 2, 6, 8, 12, 14, /**/ 0, 4, 10, 2, 6, 8, 12, 14, // + 2, 4, 10, 0, 6, 8, 12, 14, /**/ 0, 2, 4, 10, 6, 8, 12, 14, // + 6, 10, 0, 2, 4, 8, 12, 14, /**/ 0, 6, 10, 2, 4, 8, 12, 14, // + 2, 6, 10, 0, 4, 8, 12, 14, /**/ 0, 2, 6, 10, 4, 8, 12, 14, // + 4, 6, 10, 0, 2, 8, 12, 14, /**/ 0, 4, 6, 10, 2, 8, 12, 14, // + 2, 4, 6, 10, 0, 8, 12, 14, /**/ 0, 2, 4, 6, 10, 8, 12, 14, // + 8, 10, 0, 2, 4, 6, 12, 14, /**/ 0, 8, 10, 2, 4, 6, 12, 14, // + 2, 8, 10, 0, 4, 6, 12, 14, /**/ 0, 2, 8, 10, 4, 6, 12, 14, // + 4, 8, 10, 0, 2, 6, 12, 14, /**/ 0, 4, 8, 10, 2, 6, 12, 14, // + 2, 4, 8, 10, 0, 6, 12, 14, /**/ 0, 2, 4, 8, 10, 6, 12, 14, // + 6, 8, 10, 0, 2, 4, 12, 14, /**/ 0, 6, 8, 10, 2, 4, 12, 14, // + 2, 6, 8, 10, 0, 4, 12, 14, /**/ 0, 2, 6, 8, 10, 4, 12, 14, // + 4, 6, 8, 10, 0, 2, 12, 14, /**/ 0, 4, 6, 8, 10, 2, 12, 14, // + 2, 4, 6, 8, 10, 0, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // + 12, 0, 2, 4, 6, 8, 10, 14, /**/ 0, 12, 2, 4, 6, 8, 10, 14, // + 2, 12, 0, 4, 6, 8, 10, 14, /**/ 0, 2, 12, 4, 6, 8, 10, 14, // + 4, 12, 0, 2, 6, 8, 10, 14, /**/ 0, 4, 12, 2, 6, 8, 10, 14, // + 2, 4, 12, 0, 6, 8, 10, 14, /**/ 0, 2, 4, 12, 6, 8, 10, 14, // + 6, 12, 0, 2, 4, 8, 10, 14, /**/ 0, 6, 12, 2, 4, 8, 10, 14, // + 2, 6, 12, 0, 4, 8, 10, 14, /**/ 0, 2, 6, 12, 4, 8, 10, 14, // + 4, 6, 12, 0, 2, 8, 10, 14, /**/ 0, 4, 6, 12, 2, 8, 10, 14, // + 2, 4, 6, 12, 0, 8, 10, 14, /**/ 0, 2, 4, 6, 12, 8, 10, 14, // + 8, 12, 0, 2, 4, 6, 10, 14, /**/ 0, 8, 12, 2, 4, 6, 10, 14, // + 2, 8, 12, 0, 4, 6, 10, 14, /**/ 0, 2, 8, 12, 4, 6, 10, 14, // + 4, 8, 12, 0, 2, 6, 10, 14, /**/ 0, 4, 8, 12, 2, 6, 10, 14, // + 2, 4, 8, 12, 0, 6, 10, 14, /**/ 0, 2, 4, 8, 12, 6, 10, 14, // + 6, 8, 12, 0, 2, 4, 10, 14, /**/ 0, 6, 8, 12, 2, 4, 10, 14, // + 2, 6, 8, 12, 0, 4, 10, 14, /**/ 0, 2, 6, 8, 12, 4, 10, 14, // + 4, 6, 8, 12, 0, 2, 10, 14, /**/ 0, 4, 6, 8, 12, 2, 10, 14, // + 2, 4, 6, 8, 12, 0, 10, 14, /**/ 0, 2, 4, 6, 8, 12, 10, 14, // + 10, 12, 0, 2, 4, 6, 8, 14, /**/ 0, 10, 12, 2, 4, 6, 8, 14, // + 2, 10, 12, 0, 4, 6, 8, 14, /**/ 0, 2, 10, 12, 4, 6, 8, 14, // + 4, 10, 12, 0, 2, 6, 8, 14, /**/ 0, 4, 10, 12, 2, 6, 8, 14, // + 2, 4, 10, 12, 0, 6, 8, 14, /**/ 0, 2, 4, 10, 12, 6, 8, 14, // + 6, 10, 12, 0, 2, 4, 8, 14, /**/ 0, 6, 10, 12, 2, 4, 8, 14, // + 2, 6, 10, 12, 0, 4, 8, 14, /**/ 0, 2, 6, 10, 12, 4, 8, 14, // + 4, 6, 10, 12, 0, 
2, 8, 14, /**/ 0, 4, 6, 10, 12, 2, 8, 14, // + 2, 4, 6, 10, 12, 0, 8, 14, /**/ 0, 2, 4, 6, 10, 12, 8, 14, // + 8, 10, 12, 0, 2, 4, 6, 14, /**/ 0, 8, 10, 12, 2, 4, 6, 14, // + 2, 8, 10, 12, 0, 4, 6, 14, /**/ 0, 2, 8, 10, 12, 4, 6, 14, // + 4, 8, 10, 12, 0, 2, 6, 14, /**/ 0, 4, 8, 10, 12, 2, 6, 14, // + 2, 4, 8, 10, 12, 0, 6, 14, /**/ 0, 2, 4, 8, 10, 12, 6, 14, // + 6, 8, 10, 12, 0, 2, 4, 14, /**/ 0, 6, 8, 10, 12, 2, 4, 14, // + 2, 6, 8, 10, 12, 0, 4, 14, /**/ 0, 2, 6, 8, 10, 12, 4, 14, // + 4, 6, 8, 10, 12, 0, 2, 14, /**/ 0, 4, 6, 8, 10, 12, 2, 14, // + 2, 4, 6, 8, 10, 12, 0, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // + 14, 0, 2, 4, 6, 8, 10, 12, /**/ 0, 14, 2, 4, 6, 8, 10, 12, // + 2, 14, 0, 4, 6, 8, 10, 12, /**/ 0, 2, 14, 4, 6, 8, 10, 12, // + 4, 14, 0, 2, 6, 8, 10, 12, /**/ 0, 4, 14, 2, 6, 8, 10, 12, // + 2, 4, 14, 0, 6, 8, 10, 12, /**/ 0, 2, 4, 14, 6, 8, 10, 12, // + 6, 14, 0, 2, 4, 8, 10, 12, /**/ 0, 6, 14, 2, 4, 8, 10, 12, // + 2, 6, 14, 0, 4, 8, 10, 12, /**/ 0, 2, 6, 14, 4, 8, 10, 12, // + 4, 6, 14, 0, 2, 8, 10, 12, /**/ 0, 4, 6, 14, 2, 8, 10, 12, // + 2, 4, 6, 14, 0, 8, 10, 12, /**/ 0, 2, 4, 6, 14, 8, 10, 12, // + 8, 14, 0, 2, 4, 6, 10, 12, /**/ 0, 8, 14, 2, 4, 6, 10, 12, // + 2, 8, 14, 0, 4, 6, 10, 12, /**/ 0, 2, 8, 14, 4, 6, 10, 12, // + 4, 8, 14, 0, 2, 6, 10, 12, /**/ 0, 4, 8, 14, 2, 6, 10, 12, // + 2, 4, 8, 14, 0, 6, 10, 12, /**/ 0, 2, 4, 8, 14, 6, 10, 12, // + 6, 8, 14, 0, 2, 4, 10, 12, /**/ 0, 6, 8, 14, 2, 4, 10, 12, // + 2, 6, 8, 14, 0, 4, 10, 12, /**/ 0, 2, 6, 8, 14, 4, 10, 12, // + 4, 6, 8, 14, 0, 2, 10, 12, /**/ 0, 4, 6, 8, 14, 2, 10, 12, // + 2, 4, 6, 8, 14, 0, 10, 12, /**/ 0, 2, 4, 6, 8, 14, 10, 12, // + 10, 14, 0, 2, 4, 6, 8, 12, /**/ 0, 10, 14, 2, 4, 6, 8, 12, // + 2, 10, 14, 0, 4, 6, 8, 12, /**/ 0, 2, 10, 14, 4, 6, 8, 12, // + 4, 10, 14, 0, 2, 6, 8, 12, /**/ 0, 4, 10, 14, 2, 6, 8, 12, // + 2, 4, 10, 14, 0, 6, 8, 12, /**/ 0, 2, 4, 10, 14, 6, 8, 12, // + 6, 10, 14, 0, 2, 4, 8, 12, /**/ 0, 6, 10, 14, 2, 4, 8, 12, // + 2, 6, 10, 14, 0, 4, 8, 12, /**/ 0, 2, 6, 10, 14, 4, 8, 12, // + 4, 6, 10, 14, 0, 2, 8, 12, /**/ 0, 4, 6, 10, 14, 2, 8, 12, // + 2, 4, 6, 10, 14, 0, 8, 12, /**/ 0, 2, 4, 6, 10, 14, 8, 12, // + 8, 10, 14, 0, 2, 4, 6, 12, /**/ 0, 8, 10, 14, 2, 4, 6, 12, // + 2, 8, 10, 14, 0, 4, 6, 12, /**/ 0, 2, 8, 10, 14, 4, 6, 12, // + 4, 8, 10, 14, 0, 2, 6, 12, /**/ 0, 4, 8, 10, 14, 2, 6, 12, // + 2, 4, 8, 10, 14, 0, 6, 12, /**/ 0, 2, 4, 8, 10, 14, 6, 12, // + 6, 8, 10, 14, 0, 2, 4, 12, /**/ 0, 6, 8, 10, 14, 2, 4, 12, // + 2, 6, 8, 10, 14, 0, 4, 12, /**/ 0, 2, 6, 8, 10, 14, 4, 12, // + 4, 6, 8, 10, 14, 0, 2, 12, /**/ 0, 4, 6, 8, 10, 14, 2, 12, // + 2, 4, 6, 8, 10, 14, 0, 12, /**/ 0, 2, 4, 6, 8, 10, 14, 12, // + 12, 14, 0, 2, 4, 6, 8, 10, /**/ 0, 12, 14, 2, 4, 6, 8, 10, // + 2, 12, 14, 0, 4, 6, 8, 10, /**/ 0, 2, 12, 14, 4, 6, 8, 10, // + 4, 12, 14, 0, 2, 6, 8, 10, /**/ 0, 4, 12, 14, 2, 6, 8, 10, // + 2, 4, 12, 14, 0, 6, 8, 10, /**/ 0, 2, 4, 12, 14, 6, 8, 10, // + 6, 12, 14, 0, 2, 4, 8, 10, /**/ 0, 6, 12, 14, 2, 4, 8, 10, // + 2, 6, 12, 14, 0, 4, 8, 10, /**/ 0, 2, 6, 12, 14, 4, 8, 10, // + 4, 6, 12, 14, 0, 2, 8, 10, /**/ 0, 4, 6, 12, 14, 2, 8, 10, // + 2, 4, 6, 12, 14, 0, 8, 10, /**/ 0, 2, 4, 6, 12, 14, 8, 10, // + 8, 12, 14, 0, 2, 4, 6, 10, /**/ 0, 8, 12, 14, 2, 4, 6, 10, // + 2, 8, 12, 14, 0, 4, 6, 10, /**/ 0, 2, 8, 12, 14, 4, 6, 10, // + 4, 8, 12, 14, 0, 2, 6, 10, /**/ 0, 4, 8, 12, 14, 2, 6, 10, // + 2, 4, 8, 12, 14, 0, 6, 10, /**/ 0, 2, 4, 8, 12, 14, 6, 10, // + 6, 8, 12, 14, 0, 2, 4, 10, /**/ 0, 6, 8, 12, 14, 2, 4, 10, // + 2, 6, 8, 12, 14, 0, 4, 10, /**/ 0, 2, 6, 8, 12, 14, 
4, 10, // + 4, 6, 8, 12, 14, 0, 2, 10, /**/ 0, 4, 6, 8, 12, 14, 2, 10, // + 2, 4, 6, 8, 12, 14, 0, 10, /**/ 0, 2, 4, 6, 8, 12, 14, 10, // + 10, 12, 14, 0, 2, 4, 6, 8, /**/ 0, 10, 12, 14, 2, 4, 6, 8, // + 2, 10, 12, 14, 0, 4, 6, 8, /**/ 0, 2, 10, 12, 14, 4, 6, 8, // + 4, 10, 12, 14, 0, 2, 6, 8, /**/ 0, 4, 10, 12, 14, 2, 6, 8, // + 2, 4, 10, 12, 14, 0, 6, 8, /**/ 0, 2, 4, 10, 12, 14, 6, 8, // + 6, 10, 12, 14, 0, 2, 4, 8, /**/ 0, 6, 10, 12, 14, 2, 4, 8, // + 2, 6, 10, 12, 14, 0, 4, 8, /**/ 0, 2, 6, 10, 12, 14, 4, 8, // + 4, 6, 10, 12, 14, 0, 2, 8, /**/ 0, 4, 6, 10, 12, 14, 2, 8, // + 2, 4, 6, 10, 12, 14, 0, 8, /**/ 0, 2, 4, 6, 10, 12, 14, 8, // + 8, 10, 12, 14, 0, 2, 4, 6, /**/ 0, 8, 10, 12, 14, 2, 4, 6, // + 2, 8, 10, 12, 14, 0, 4, 6, /**/ 0, 2, 8, 10, 12, 14, 4, 6, // + 4, 8, 10, 12, 14, 0, 2, 6, /**/ 0, 4, 8, 10, 12, 14, 2, 6, // + 2, 4, 8, 10, 12, 14, 0, 6, /**/ 0, 2, 4, 8, 10, 12, 14, 6, // + 6, 8, 10, 12, 14, 0, 2, 4, /**/ 0, 6, 8, 10, 12, 14, 2, 4, // + 2, 6, 8, 10, 12, 14, 0, 4, /**/ 0, 2, 6, 8, 10, 12, 14, 4, // + 4, 6, 8, 10, 12, 14, 0, 2, /**/ 0, 4, 6, 8, 10, 12, 14, 2, // + 2, 4, 6, 8, 10, 12, 14, 0, /**/ 0, 2, 4, 6, 8, 10, 12, 14}; + + const Vec128 byte_idx = Load8Bytes(d8, table + mask_bits * 8); + const Vec128 pairs = ZipLower(byte_idx, byte_idx); + return BitCast(d, pairs + Set(du, 0x0100)); +} + +template +HWY_INLINE Vec128 IdxFromNotBits(hwy::SizeTag<2> /*tag*/, + const uint64_t mask_bits) { + HWY_DASSERT(mask_bits < 256); + const Simd d; + const Repartition d8; + const Simd du; + + // ARM does not provide an equivalent of AVX2 permutevar, so we need byte + // indices for VTBL (one vector's worth for each of 256 combinations of + // 8 mask bits). Loading them directly would require 4 KiB. We can instead + // store lane indices and convert to byte indices (2*lane + 0..1), with the + // doubling baked into the table. AVX2 Compress32 stores eight 4-bit lane + // indices (total 1 KiB), broadcasts them into each 32-bit lane and shifts. + // Here, 16-bit lanes are too narrow to hold all bits, and unpacking nibbles + // is likely more costly than the higher cache footprint from storing bytes. 
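+  // The expansion below mirrors IdxFromBits: ZipLower(byte_idx, byte_idx)
+  // places each doubled lane index k into a 16-bit lane as (k, k); adding
+  // 0x0100 (little-endian) turns that into the byte pair (k, k+1), e.g. lane
+  // index 3, stored as 6, expands to bytes (6, 7).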
+ alignas(16) constexpr uint8_t table[256 * 8] = { + // PrintCompressNot16x8Tables + 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 8, 10, 12, 14, 0, // + 0, 4, 6, 8, 10, 12, 14, 2, /**/ 4, 6, 8, 10, 12, 14, 0, 2, // + 0, 2, 6, 8, 10, 12, 14, 4, /**/ 2, 6, 8, 10, 12, 14, 0, 4, // + 0, 6, 8, 10, 12, 14, 2, 4, /**/ 6, 8, 10, 12, 14, 0, 2, 4, // + 0, 2, 4, 8, 10, 12, 14, 6, /**/ 2, 4, 8, 10, 12, 14, 0, 6, // + 0, 4, 8, 10, 12, 14, 2, 6, /**/ 4, 8, 10, 12, 14, 0, 2, 6, // + 0, 2, 8, 10, 12, 14, 4, 6, /**/ 2, 8, 10, 12, 14, 0, 4, 6, // + 0, 8, 10, 12, 14, 2, 4, 6, /**/ 8, 10, 12, 14, 0, 2, 4, 6, // + 0, 2, 4, 6, 10, 12, 14, 8, /**/ 2, 4, 6, 10, 12, 14, 0, 8, // + 0, 4, 6, 10, 12, 14, 2, 8, /**/ 4, 6, 10, 12, 14, 0, 2, 8, // + 0, 2, 6, 10, 12, 14, 4, 8, /**/ 2, 6, 10, 12, 14, 0, 4, 8, // + 0, 6, 10, 12, 14, 2, 4, 8, /**/ 6, 10, 12, 14, 0, 2, 4, 8, // + 0, 2, 4, 10, 12, 14, 6, 8, /**/ 2, 4, 10, 12, 14, 0, 6, 8, // + 0, 4, 10, 12, 14, 2, 6, 8, /**/ 4, 10, 12, 14, 0, 2, 6, 8, // + 0, 2, 10, 12, 14, 4, 6, 8, /**/ 2, 10, 12, 14, 0, 4, 6, 8, // + 0, 10, 12, 14, 2, 4, 6, 8, /**/ 10, 12, 14, 0, 2, 4, 6, 8, // + 0, 2, 4, 6, 8, 12, 14, 10, /**/ 2, 4, 6, 8, 12, 14, 0, 10, // + 0, 4, 6, 8, 12, 14, 2, 10, /**/ 4, 6, 8, 12, 14, 0, 2, 10, // + 0, 2, 6, 8, 12, 14, 4, 10, /**/ 2, 6, 8, 12, 14, 0, 4, 10, // + 0, 6, 8, 12, 14, 2, 4, 10, /**/ 6, 8, 12, 14, 0, 2, 4, 10, // + 0, 2, 4, 8, 12, 14, 6, 10, /**/ 2, 4, 8, 12, 14, 0, 6, 10, // + 0, 4, 8, 12, 14, 2, 6, 10, /**/ 4, 8, 12, 14, 0, 2, 6, 10, // + 0, 2, 8, 12, 14, 4, 6, 10, /**/ 2, 8, 12, 14, 0, 4, 6, 10, // + 0, 8, 12, 14, 2, 4, 6, 10, /**/ 8, 12, 14, 0, 2, 4, 6, 10, // + 0, 2, 4, 6, 12, 14, 8, 10, /**/ 2, 4, 6, 12, 14, 0, 8, 10, // + 0, 4, 6, 12, 14, 2, 8, 10, /**/ 4, 6, 12, 14, 0, 2, 8, 10, // + 0, 2, 6, 12, 14, 4, 8, 10, /**/ 2, 6, 12, 14, 0, 4, 8, 10, // + 0, 6, 12, 14, 2, 4, 8, 10, /**/ 6, 12, 14, 0, 2, 4, 8, 10, // + 0, 2, 4, 12, 14, 6, 8, 10, /**/ 2, 4, 12, 14, 0, 6, 8, 10, // + 0, 4, 12, 14, 2, 6, 8, 10, /**/ 4, 12, 14, 0, 2, 6, 8, 10, // + 0, 2, 12, 14, 4, 6, 8, 10, /**/ 2, 12, 14, 0, 4, 6, 8, 10, // + 0, 12, 14, 2, 4, 6, 8, 10, /**/ 12, 14, 0, 2, 4, 6, 8, 10, // + 0, 2, 4, 6, 8, 10, 14, 12, /**/ 2, 4, 6, 8, 10, 14, 0, 12, // + 0, 4, 6, 8, 10, 14, 2, 12, /**/ 4, 6, 8, 10, 14, 0, 2, 12, // + 0, 2, 6, 8, 10, 14, 4, 12, /**/ 2, 6, 8, 10, 14, 0, 4, 12, // + 0, 6, 8, 10, 14, 2, 4, 12, /**/ 6, 8, 10, 14, 0, 2, 4, 12, // + 0, 2, 4, 8, 10, 14, 6, 12, /**/ 2, 4, 8, 10, 14, 0, 6, 12, // + 0, 4, 8, 10, 14, 2, 6, 12, /**/ 4, 8, 10, 14, 0, 2, 6, 12, // + 0, 2, 8, 10, 14, 4, 6, 12, /**/ 2, 8, 10, 14, 0, 4, 6, 12, // + 0, 8, 10, 14, 2, 4, 6, 12, /**/ 8, 10, 14, 0, 2, 4, 6, 12, // + 0, 2, 4, 6, 10, 14, 8, 12, /**/ 2, 4, 6, 10, 14, 0, 8, 12, // + 0, 4, 6, 10, 14, 2, 8, 12, /**/ 4, 6, 10, 14, 0, 2, 8, 12, // + 0, 2, 6, 10, 14, 4, 8, 12, /**/ 2, 6, 10, 14, 0, 4, 8, 12, // + 0, 6, 10, 14, 2, 4, 8, 12, /**/ 6, 10, 14, 0, 2, 4, 8, 12, // + 0, 2, 4, 10, 14, 6, 8, 12, /**/ 2, 4, 10, 14, 0, 6, 8, 12, // + 0, 4, 10, 14, 2, 6, 8, 12, /**/ 4, 10, 14, 0, 2, 6, 8, 12, // + 0, 2, 10, 14, 4, 6, 8, 12, /**/ 2, 10, 14, 0, 4, 6, 8, 12, // + 0, 10, 14, 2, 4, 6, 8, 12, /**/ 10, 14, 0, 2, 4, 6, 8, 12, // + 0, 2, 4, 6, 8, 14, 10, 12, /**/ 2, 4, 6, 8, 14, 0, 10, 12, // + 0, 4, 6, 8, 14, 2, 10, 12, /**/ 4, 6, 8, 14, 0, 2, 10, 12, // + 0, 2, 6, 8, 14, 4, 10, 12, /**/ 2, 6, 8, 14, 0, 4, 10, 12, // + 0, 6, 8, 14, 2, 4, 10, 12, /**/ 6, 8, 14, 0, 2, 4, 10, 12, // + 0, 2, 4, 8, 14, 6, 10, 12, /**/ 2, 4, 8, 14, 0, 6, 10, 12, // + 0, 4, 8, 14, 2, 6, 10, 12, /**/ 4, 8, 14, 0, 2, 6, 10, 12, // + 0, 2, 8, 14, 
4, 6, 10, 12, /**/ 2, 8, 14, 0, 4, 6, 10, 12, // + 0, 8, 14, 2, 4, 6, 10, 12, /**/ 8, 14, 0, 2, 4, 6, 10, 12, // + 0, 2, 4, 6, 14, 8, 10, 12, /**/ 2, 4, 6, 14, 0, 8, 10, 12, // + 0, 4, 6, 14, 2, 8, 10, 12, /**/ 4, 6, 14, 0, 2, 8, 10, 12, // + 0, 2, 6, 14, 4, 8, 10, 12, /**/ 2, 6, 14, 0, 4, 8, 10, 12, // + 0, 6, 14, 2, 4, 8, 10, 12, /**/ 6, 14, 0, 2, 4, 8, 10, 12, // + 0, 2, 4, 14, 6, 8, 10, 12, /**/ 2, 4, 14, 0, 6, 8, 10, 12, // + 0, 4, 14, 2, 6, 8, 10, 12, /**/ 4, 14, 0, 2, 6, 8, 10, 12, // + 0, 2, 14, 4, 6, 8, 10, 12, /**/ 2, 14, 0, 4, 6, 8, 10, 12, // + 0, 14, 2, 4, 6, 8, 10, 12, /**/ 14, 0, 2, 4, 6, 8, 10, 12, // + 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 8, 10, 12, 0, 14, // + 0, 4, 6, 8, 10, 12, 2, 14, /**/ 4, 6, 8, 10, 12, 0, 2, 14, // + 0, 2, 6, 8, 10, 12, 4, 14, /**/ 2, 6, 8, 10, 12, 0, 4, 14, // + 0, 6, 8, 10, 12, 2, 4, 14, /**/ 6, 8, 10, 12, 0, 2, 4, 14, // + 0, 2, 4, 8, 10, 12, 6, 14, /**/ 2, 4, 8, 10, 12, 0, 6, 14, // + 0, 4, 8, 10, 12, 2, 6, 14, /**/ 4, 8, 10, 12, 0, 2, 6, 14, // + 0, 2, 8, 10, 12, 4, 6, 14, /**/ 2, 8, 10, 12, 0, 4, 6, 14, // + 0, 8, 10, 12, 2, 4, 6, 14, /**/ 8, 10, 12, 0, 2, 4, 6, 14, // + 0, 2, 4, 6, 10, 12, 8, 14, /**/ 2, 4, 6, 10, 12, 0, 8, 14, // + 0, 4, 6, 10, 12, 2, 8, 14, /**/ 4, 6, 10, 12, 0, 2, 8, 14, // + 0, 2, 6, 10, 12, 4, 8, 14, /**/ 2, 6, 10, 12, 0, 4, 8, 14, // + 0, 6, 10, 12, 2, 4, 8, 14, /**/ 6, 10, 12, 0, 2, 4, 8, 14, // + 0, 2, 4, 10, 12, 6, 8, 14, /**/ 2, 4, 10, 12, 0, 6, 8, 14, // + 0, 4, 10, 12, 2, 6, 8, 14, /**/ 4, 10, 12, 0, 2, 6, 8, 14, // + 0, 2, 10, 12, 4, 6, 8, 14, /**/ 2, 10, 12, 0, 4, 6, 8, 14, // + 0, 10, 12, 2, 4, 6, 8, 14, /**/ 10, 12, 0, 2, 4, 6, 8, 14, // + 0, 2, 4, 6, 8, 12, 10, 14, /**/ 2, 4, 6, 8, 12, 0, 10, 14, // + 0, 4, 6, 8, 12, 2, 10, 14, /**/ 4, 6, 8, 12, 0, 2, 10, 14, // + 0, 2, 6, 8, 12, 4, 10, 14, /**/ 2, 6, 8, 12, 0, 4, 10, 14, // + 0, 6, 8, 12, 2, 4, 10, 14, /**/ 6, 8, 12, 0, 2, 4, 10, 14, // + 0, 2, 4, 8, 12, 6, 10, 14, /**/ 2, 4, 8, 12, 0, 6, 10, 14, // + 0, 4, 8, 12, 2, 6, 10, 14, /**/ 4, 8, 12, 0, 2, 6, 10, 14, // + 0, 2, 8, 12, 4, 6, 10, 14, /**/ 2, 8, 12, 0, 4, 6, 10, 14, // + 0, 8, 12, 2, 4, 6, 10, 14, /**/ 8, 12, 0, 2, 4, 6, 10, 14, // + 0, 2, 4, 6, 12, 8, 10, 14, /**/ 2, 4, 6, 12, 0, 8, 10, 14, // + 0, 4, 6, 12, 2, 8, 10, 14, /**/ 4, 6, 12, 0, 2, 8, 10, 14, // + 0, 2, 6, 12, 4, 8, 10, 14, /**/ 2, 6, 12, 0, 4, 8, 10, 14, // + 0, 6, 12, 2, 4, 8, 10, 14, /**/ 6, 12, 0, 2, 4, 8, 10, 14, // + 0, 2, 4, 12, 6, 8, 10, 14, /**/ 2, 4, 12, 0, 6, 8, 10, 14, // + 0, 4, 12, 2, 6, 8, 10, 14, /**/ 4, 12, 0, 2, 6, 8, 10, 14, // + 0, 2, 12, 4, 6, 8, 10, 14, /**/ 2, 12, 0, 4, 6, 8, 10, 14, // + 0, 12, 2, 4, 6, 8, 10, 14, /**/ 12, 0, 2, 4, 6, 8, 10, 14, // + 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 8, 10, 0, 12, 14, // + 0, 4, 6, 8, 10, 2, 12, 14, /**/ 4, 6, 8, 10, 0, 2, 12, 14, // + 0, 2, 6, 8, 10, 4, 12, 14, /**/ 2, 6, 8, 10, 0, 4, 12, 14, // + 0, 6, 8, 10, 2, 4, 12, 14, /**/ 6, 8, 10, 0, 2, 4, 12, 14, // + 0, 2, 4, 8, 10, 6, 12, 14, /**/ 2, 4, 8, 10, 0, 6, 12, 14, // + 0, 4, 8, 10, 2, 6, 12, 14, /**/ 4, 8, 10, 0, 2, 6, 12, 14, // + 0, 2, 8, 10, 4, 6, 12, 14, /**/ 2, 8, 10, 0, 4, 6, 12, 14, // + 0, 8, 10, 2, 4, 6, 12, 14, /**/ 8, 10, 0, 2, 4, 6, 12, 14, // + 0, 2, 4, 6, 10, 8, 12, 14, /**/ 2, 4, 6, 10, 0, 8, 12, 14, // + 0, 4, 6, 10, 2, 8, 12, 14, /**/ 4, 6, 10, 0, 2, 8, 12, 14, // + 0, 2, 6, 10, 4, 8, 12, 14, /**/ 2, 6, 10, 0, 4, 8, 12, 14, // + 0, 6, 10, 2, 4, 8, 12, 14, /**/ 6, 10, 0, 2, 4, 8, 12, 14, // + 0, 2, 4, 10, 6, 8, 12, 14, /**/ 2, 4, 10, 0, 6, 8, 12, 14, // + 0, 4, 10, 2, 6, 8, 12, 14, /**/ 4, 10, 0, 2, 6, 
8, 12, 14, // + 0, 2, 10, 4, 6, 8, 12, 14, /**/ 2, 10, 0, 4, 6, 8, 12, 14, // + 0, 10, 2, 4, 6, 8, 12, 14, /**/ 10, 0, 2, 4, 6, 8, 12, 14, // + 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 8, 0, 10, 12, 14, // + 0, 4, 6, 8, 2, 10, 12, 14, /**/ 4, 6, 8, 0, 2, 10, 12, 14, // + 0, 2, 6, 8, 4, 10, 12, 14, /**/ 2, 6, 8, 0, 4, 10, 12, 14, // + 0, 6, 8, 2, 4, 10, 12, 14, /**/ 6, 8, 0, 2, 4, 10, 12, 14, // + 0, 2, 4, 8, 6, 10, 12, 14, /**/ 2, 4, 8, 0, 6, 10, 12, 14, // + 0, 4, 8, 2, 6, 10, 12, 14, /**/ 4, 8, 0, 2, 6, 10, 12, 14, // + 0, 2, 8, 4, 6, 10, 12, 14, /**/ 2, 8, 0, 4, 6, 10, 12, 14, // + 0, 8, 2, 4, 6, 10, 12, 14, /**/ 8, 0, 2, 4, 6, 10, 12, 14, // + 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 0, 8, 10, 12, 14, // + 0, 4, 6, 2, 8, 10, 12, 14, /**/ 4, 6, 0, 2, 8, 10, 12, 14, // + 0, 2, 6, 4, 8, 10, 12, 14, /**/ 2, 6, 0, 4, 8, 10, 12, 14, // + 0, 6, 2, 4, 8, 10, 12, 14, /**/ 6, 0, 2, 4, 8, 10, 12, 14, // + 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 0, 6, 8, 10, 12, 14, // + 0, 4, 2, 6, 8, 10, 12, 14, /**/ 4, 0, 2, 6, 8, 10, 12, 14, // + 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 0, 4, 6, 8, 10, 12, 14, // + 0, 2, 4, 6, 8, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14}; + + const Vec128 byte_idx = Load8Bytes(d8, table + mask_bits * 8); + const Vec128 pairs = ZipLower(byte_idx, byte_idx); + return BitCast(d, pairs + Set(du, 0x0100)); +} + +template +HWY_INLINE Vec128 IdxFromBits(hwy::SizeTag<4> /*tag*/, + const uint64_t mask_bits) { + HWY_DASSERT(mask_bits < 16); + + // There are only 4 lanes, so we can afford to load the index vector directly. + alignas(16) constexpr uint8_t u8_indices[16 * 16] = { + // PrintCompress32x4Tables + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, // + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, // + 4, 5, 6, 7, 0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, // + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, // + 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, // + 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, // + 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, // + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, // + 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, // + 0, 1, 2, 3, 12, 13, 14, 15, 4, 5, 6, 7, 8, 9, 10, 11, // + 4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 8, 9, 10, 11, // + 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 8, 9, 10, 11, // + 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, // + 0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, // + 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, // + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; + const Simd d; + const Repartition d8; + return BitCast(d, Load(d8, u8_indices + 16 * mask_bits)); +} + +template +HWY_INLINE Vec128 IdxFromNotBits(hwy::SizeTag<4> /*tag*/, + const uint64_t mask_bits) { + HWY_DASSERT(mask_bits < 16); + + // There are only 4 lanes, so we can afford to load the index vector directly. 
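+  // (16 mask combinations x 16 bytes = 256 bytes per table, small enough
+  // that the byte-pair packing used for 16-bit lanes is unnecessary here.)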
+ alignas(16) constexpr uint8_t u8_indices[16 * 16] = { + // PrintCompressNot32x4Tables + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, + 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3, + 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, + 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, + 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, 0, 1, + 2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, + 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, 0, 1, + 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, 8, 9, 10, 11, + 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, + 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, 0, 1, 2, 3, + 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, + 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, + 12, 13, 14, 15}; + const Simd d; + const Repartition d8; + return BitCast(d, Load(d8, u8_indices + 16 * mask_bits)); +} + +#if HWY_HAVE_INTEGER64 || HWY_HAVE_FLOAT64 + +template +HWY_INLINE Vec128 IdxFromBits(hwy::SizeTag<8> /*tag*/, + const uint64_t mask_bits) { + HWY_DASSERT(mask_bits < 4); + + // There are only 2 lanes, so we can afford to load the index vector directly. + alignas(16) constexpr uint8_t u8_indices[64] = { + // PrintCompress64x2Tables + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; + + const Simd d; + const Repartition d8; + return BitCast(d, Load(d8, u8_indices + 16 * mask_bits)); +} + +template +HWY_INLINE Vec128 IdxFromNotBits(hwy::SizeTag<8> /*tag*/, + const uint64_t mask_bits) { + HWY_DASSERT(mask_bits < 4); + + // There are only 2 lanes, so we can afford to load the index vector directly. + alignas(16) constexpr uint8_t u8_indices[4 * 16] = { + // PrintCompressNot64x2Tables + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; + + const Simd d; + const Repartition d8; + return BitCast(d, Load(d8, u8_indices + 16 * mask_bits)); +} + +#endif + +// Helper function called by both Compress and CompressStore - avoids a +// redundant BitsFromMask in the latter. +template +HWY_INLINE Vec128 Compress(Vec128 v, const uint64_t mask_bits) { + const auto idx = + detail::IdxFromBits(hwy::SizeTag(), mask_bits); + using D = Simd; + const RebindToSigned di; + return BitCast(D(), TableLookupBytes(BitCast(di, v), BitCast(di, idx))); +} + +template +HWY_INLINE Vec128 CompressNot(Vec128 v, const uint64_t mask_bits) { + const auto idx = + detail::IdxFromNotBits(hwy::SizeTag(), mask_bits); + using D = Simd; + const RebindToSigned di; + return BitCast(D(), TableLookupBytes(BitCast(di, v), BitCast(di, idx))); +} + +} // namespace detail + +// Single lane: no-op +template +HWY_API Vec128 Compress(Vec128 v, Mask128 /*m*/) { + return v; +} + +// Two lanes: conditional swap +template +HWY_API Vec128 Compress(Vec128 v, const Mask128 mask) { + // If mask[1] = 1 and mask[0] = 0, then swap both halves, else keep. 
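+  // DupEven/DupOdd broadcast mask[0] and mask[1] to both lanes, so
+  // AndNot(maskL, maskH) is all-ones exactly when mask[0]=0 and mask[1]=1,
+  // the only case requiring the swap. E.g. v = {a, b} with that mask
+  // compresses to {b, a}; only the first lane (CountTrue = 1) is meaningful.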
+ const Simd d; + const Vec128 m = VecFromMask(d, mask); + const Vec128 maskL = DupEven(m); + const Vec128 maskH = DupOdd(m); + const Vec128 swap = AndNot(maskL, maskH); + return IfVecThenElse(swap, Shuffle01(v), v); +} + +// General case +template +HWY_API Vec128 Compress(Vec128 v, const Mask128 mask) { + return detail::Compress(v, detail::BitsFromMask(mask)); +} + +// Single lane: no-op +template +HWY_API Vec128 CompressNot(Vec128 v, Mask128 /*m*/) { + return v; +} + +// Two lanes: conditional swap +template +HWY_API Vec128 CompressNot(Vec128 v, Mask128 mask) { + // If mask[1] = 0 and mask[0] = 1, then swap both halves, else keep. + const Full128 d; + const Vec128 m = VecFromMask(d, mask); + const Vec128 maskL = DupEven(m); + const Vec128 maskH = DupOdd(m); + const Vec128 swap = AndNot(maskH, maskL); + return IfVecThenElse(swap, Shuffle01(v), v); +} + +// General case +template +HWY_API Vec128 CompressNot(Vec128 v, Mask128 mask) { + // For partial vectors, we cannot pull the Not() into the table because + // BitsFromMask clears the upper bits. + if (N < 16 / sizeof(T)) { + return detail::Compress(v, detail::BitsFromMask(Not(mask))); + } + return detail::CompressNot(v, detail::BitsFromMask(mask)); +} + +// ------------------------------ CompressBlocksNot +HWY_API Vec128 CompressBlocksNot(Vec128 v, + Mask128 /* m */) { + return v; +} + +// ------------------------------ CompressBits + +template +HWY_INLINE Vec128 CompressBits(Vec128 v, + const uint8_t* HWY_RESTRICT bits) { + uint64_t mask_bits = 0; + constexpr size_t kNumBytes = (N + 7) / 8; + CopyBytes(bits, &mask_bits); + if (N < 8) { + mask_bits &= (1ull << N) - 1; + } + + return detail::Compress(v, mask_bits); +} + +// ------------------------------ CompressStore +template +HWY_API size_t CompressStore(Vec128 v, const Mask128 mask, + Simd d, T* HWY_RESTRICT unaligned) { + const uint64_t mask_bits = detail::BitsFromMask(mask); + StoreU(detail::Compress(v, mask_bits), d, unaligned); + return PopCount(mask_bits); +} + +// ------------------------------ CompressBlendedStore +template +HWY_API size_t CompressBlendedStore(Vec128 v, Mask128 m, + Simd d, + T* HWY_RESTRICT unaligned) { + const RebindToUnsigned du; // so we can support fp16/bf16 + using TU = TFromD; + const uint64_t mask_bits = detail::BitsFromMask(m); + const size_t count = PopCount(mask_bits); + const Mask128 store_mask = RebindMask(d, FirstN(du, count)); + const Vec128 compressed = detail::Compress(BitCast(du, v), mask_bits); + BlendedStore(BitCast(d, compressed), store_mask, d, unaligned); + return count; +} + +// ------------------------------ CompressBitsStore + +template +HWY_API size_t CompressBitsStore(Vec128 v, + const uint8_t* HWY_RESTRICT bits, + Simd d, T* HWY_RESTRICT unaligned) { + uint64_t mask_bits = 0; + constexpr size_t kNumBytes = (N + 7) / 8; + CopyBytes(bits, &mask_bits); + if (N < 8) { + mask_bits &= (1ull << N) - 1; + } + + StoreU(detail::Compress(v, mask_bits), d, unaligned); + return PopCount(mask_bits); +} + +// ------------------------------ LoadInterleaved2 + +// Per-target flag to prevent generic_ops-inl.h from defining LoadInterleaved2. 
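+// (The #ifdef/#else below flips this flag on each per-target re-inclusion so
+// that, by comparing it with the per-target toggle, generic_ops-inl.h can
+// tell whether the current target provides a native implementation.)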
+#ifdef HWY_NATIVE_LOAD_STORE_INTERLEAVED +#undef HWY_NATIVE_LOAD_STORE_INTERLEAVED +#else +#define HWY_NATIVE_LOAD_STORE_INTERLEAVED +#endif + +namespace detail { +#define HWY_NEON_BUILD_TPL_HWY_LOAD_INT +#define HWY_NEON_BUILD_ARG_HWY_LOAD_INT from + +#if HWY_ARCH_ARM_A64 +#define HWY_IF_LOAD_INT(T, N) HWY_IF_GE64(T, N) +#define HWY_NEON_DEF_FUNCTION_LOAD_INT HWY_NEON_DEF_FUNCTION_ALL_TYPES +#else +// Exclude 64x2 and f64x1, which are only supported on aarch64 +#define HWY_IF_LOAD_INT(T, N) \ + hwy::EnableIf= 8 && (N == 1 || sizeof(T) < 8)>* = nullptr +#define HWY_NEON_DEF_FUNCTION_LOAD_INT(name, prefix, infix, args) \ + HWY_NEON_DEF_FUNCTION_INT_8_16_32(name, prefix, infix, args) \ + HWY_NEON_DEF_FUNCTION_UINT_8_16_32(name, prefix, infix, args) \ + HWY_NEON_DEF_FUNCTION_FLOAT_32(name, prefix, infix, args) \ + HWY_NEON_DEF_FUNCTION(int64, 1, name, prefix, infix, s64, args) \ + HWY_NEON_DEF_FUNCTION(uint64, 1, name, prefix, infix, u64, args) +#endif // HWY_ARCH_ARM_A64 + +// Must return raw tuple because Tuple2 lack a ctor, and we cannot use +// brace-initialization in HWY_NEON_DEF_FUNCTION because some functions return +// void. +#define HWY_NEON_BUILD_RET_HWY_LOAD_INT(type, size) \ + decltype(Tuple2().raw) +// Tuple tag arg allows overloading (cannot just overload on return type) +#define HWY_NEON_BUILD_PARAM_HWY_LOAD_INT(type, size) \ + const type##_t *from, Tuple2 +HWY_NEON_DEF_FUNCTION_LOAD_INT(LoadInterleaved2, vld2, _, HWY_LOAD_INT) +#undef HWY_NEON_BUILD_RET_HWY_LOAD_INT +#undef HWY_NEON_BUILD_PARAM_HWY_LOAD_INT + +#define HWY_NEON_BUILD_RET_HWY_LOAD_INT(type, size) \ + decltype(Tuple3().raw) +#define HWY_NEON_BUILD_PARAM_HWY_LOAD_INT(type, size) \ + const type##_t *from, Tuple3 +HWY_NEON_DEF_FUNCTION_LOAD_INT(LoadInterleaved3, vld3, _, HWY_LOAD_INT) +#undef HWY_NEON_BUILD_PARAM_HWY_LOAD_INT +#undef HWY_NEON_BUILD_RET_HWY_LOAD_INT + +#define HWY_NEON_BUILD_RET_HWY_LOAD_INT(type, size) \ + decltype(Tuple4().raw) +#define HWY_NEON_BUILD_PARAM_HWY_LOAD_INT(type, size) \ + const type##_t *from, Tuple4 +HWY_NEON_DEF_FUNCTION_LOAD_INT(LoadInterleaved4, vld4, _, HWY_LOAD_INT) +#undef HWY_NEON_BUILD_PARAM_HWY_LOAD_INT +#undef HWY_NEON_BUILD_RET_HWY_LOAD_INT + +#undef HWY_NEON_DEF_FUNCTION_LOAD_INT +#undef HWY_NEON_BUILD_TPL_HWY_LOAD_INT +#undef HWY_NEON_BUILD_ARG_HWY_LOAD_INT +} // namespace detail + +template +HWY_API void LoadInterleaved2(Simd /*tag*/, + const T* HWY_RESTRICT unaligned, Vec128& v0, + Vec128& v1) { + auto raw = detail::LoadInterleaved2(unaligned, detail::Tuple2()); + v0 = Vec128(raw.val[0]); + v1 = Vec128(raw.val[1]); +} + +// <= 32 bits: avoid loading more than N bytes by copying to buffer +template +HWY_API void LoadInterleaved2(Simd /*tag*/, + const T* HWY_RESTRICT unaligned, Vec128& v0, + Vec128& v1) { + // The smallest vector registers are 64-bits and we want space for two. 
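+  // E.g. for T=uint8_t, N=4, only 2*4 = 8 of the 16 buffer bytes are filled;
+  // vld2 still reads full 64-bit registers, but from the zero-initialized
+  // stack buffer rather than past the caller's 8 valid bytes.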
+ alignas(16) T buf[2 * 8 / sizeof(T)] = {}; + CopyBytes(unaligned, buf); + auto raw = detail::LoadInterleaved2(buf, detail::Tuple2()); + v0 = Vec128(raw.val[0]); + v1 = Vec128(raw.val[1]); +} + +#if HWY_ARCH_ARM_V7 +// 64x2: split into two 64x1 +template +HWY_API void LoadInterleaved2(Full128 d, T* HWY_RESTRICT unaligned, + Vec128& v0, Vec128& v1) { + const Half dh; + VFromD v00, v10, v01, v11; + LoadInterleaved2(dh, unaligned, v00, v10); + LoadInterleaved2(dh, unaligned + 2, v01, v11); + v0 = Combine(d, v01, v00); + v1 = Combine(d, v11, v10); +} +#endif // HWY_ARCH_ARM_V7 + +// ------------------------------ LoadInterleaved3 + +template +HWY_API void LoadInterleaved3(Simd /*tag*/, + const T* HWY_RESTRICT unaligned, Vec128& v0, + Vec128& v1, Vec128& v2) { + auto raw = detail::LoadInterleaved3(unaligned, detail::Tuple3()); + v0 = Vec128(raw.val[0]); + v1 = Vec128(raw.val[1]); + v2 = Vec128(raw.val[2]); +} + +// <= 32 bits: avoid writing more than N bytes by copying to buffer +template +HWY_API void LoadInterleaved3(Simd /*tag*/, + const T* HWY_RESTRICT unaligned, Vec128& v0, + Vec128& v1, Vec128& v2) { + // The smallest vector registers are 64-bits and we want space for three. + alignas(16) T buf[3 * 8 / sizeof(T)] = {}; + CopyBytes(unaligned, buf); + auto raw = detail::LoadInterleaved3(buf, detail::Tuple3()); + v0 = Vec128(raw.val[0]); + v1 = Vec128(raw.val[1]); + v2 = Vec128(raw.val[2]); +} + +#if HWY_ARCH_ARM_V7 +// 64x2: split into two 64x1 +template +HWY_API void LoadInterleaved3(Full128 d, const T* HWY_RESTRICT unaligned, + Vec128& v0, Vec128& v1, Vec128& v2) { + const Half dh; + VFromD v00, v10, v20, v01, v11, v21; + LoadInterleaved3(dh, unaligned, v00, v10, v20); + LoadInterleaved3(dh, unaligned + 3, v01, v11, v21); + v0 = Combine(d, v01, v00); + v1 = Combine(d, v11, v10); + v2 = Combine(d, v21, v20); +} +#endif // HWY_ARCH_ARM_V7 + +// ------------------------------ LoadInterleaved4 + +template +HWY_API void LoadInterleaved4(Simd /*tag*/, + const T* HWY_RESTRICT unaligned, Vec128& v0, + Vec128& v1, Vec128& v2, + Vec128& v3) { + auto raw = detail::LoadInterleaved4(unaligned, detail::Tuple4()); + v0 = Vec128(raw.val[0]); + v1 = Vec128(raw.val[1]); + v2 = Vec128(raw.val[2]); + v3 = Vec128(raw.val[3]); +} + +// <= 32 bits: avoid writing more than N bytes by copying to buffer +template +HWY_API void LoadInterleaved4(Simd /*tag*/, + const T* HWY_RESTRICT unaligned, Vec128& v0, + Vec128& v1, Vec128& v2, + Vec128& v3) { + alignas(16) T buf[4 * 8 / sizeof(T)] = {}; + CopyBytes(unaligned, buf); + auto raw = detail::LoadInterleaved4(buf, detail::Tuple4()); + v0 = Vec128(raw.val[0]); + v1 = Vec128(raw.val[1]); + v2 = Vec128(raw.val[2]); + v3 = Vec128(raw.val[3]); +} + +#if HWY_ARCH_ARM_V7 +// 64x2: split into two 64x1 +template +HWY_API void LoadInterleaved4(Full128 d, const T* HWY_RESTRICT unaligned, + Vec128& v0, Vec128& v1, Vec128& v2, + Vec128& v3) { + const Half dh; + VFromD v00, v10, v20, v30, v01, v11, v21, v31; + LoadInterleaved4(dh, unaligned, v00, v10, v20, v30); + LoadInterleaved4(dh, unaligned + 4, v01, v11, v21, v31); + v0 = Combine(d, v01, v00); + v1 = Combine(d, v11, v10); + v2 = Combine(d, v21, v20); + v3 = Combine(d, v31, v30); +} +#endif // HWY_ARCH_ARM_V7 + +#undef HWY_IF_LOAD_INT + +// ------------------------------ StoreInterleaved2 + +namespace detail { +#define HWY_NEON_BUILD_TPL_HWY_STORE_INT +#define HWY_NEON_BUILD_RET_HWY_STORE_INT(type, size) void +#define HWY_NEON_BUILD_ARG_HWY_STORE_INT to, tup.raw + +#if HWY_ARCH_ARM_A64 +#define HWY_IF_STORE_INT(T, N) 
HWY_IF_GE64(T, N) +#define HWY_NEON_DEF_FUNCTION_STORE_INT HWY_NEON_DEF_FUNCTION_ALL_TYPES +#else +// Exclude 64x2 and f64x1, which are only supported on aarch64 +#define HWY_IF_STORE_INT(T, N) \ + hwy::EnableIf= 8 && (N == 1 || sizeof(T) < 8)>* = nullptr +#define HWY_NEON_DEF_FUNCTION_STORE_INT(name, prefix, infix, args) \ + HWY_NEON_DEF_FUNCTION_INT_8_16_32(name, prefix, infix, args) \ + HWY_NEON_DEF_FUNCTION_UINT_8_16_32(name, prefix, infix, args) \ + HWY_NEON_DEF_FUNCTION_FLOAT_32(name, prefix, infix, args) \ + HWY_NEON_DEF_FUNCTION(int64, 1, name, prefix, infix, s64, args) \ + HWY_NEON_DEF_FUNCTION(uint64, 1, name, prefix, infix, u64, args) +#endif // HWY_ARCH_ARM_A64 + +#define HWY_NEON_BUILD_PARAM_HWY_STORE_INT(type, size) \ + Tuple2 tup, type##_t *to +HWY_NEON_DEF_FUNCTION_STORE_INT(StoreInterleaved2, vst2, _, HWY_STORE_INT) +#undef HWY_NEON_BUILD_PARAM_HWY_STORE_INT + +#define HWY_NEON_BUILD_PARAM_HWY_STORE_INT(type, size) \ + Tuple3 tup, type##_t *to +HWY_NEON_DEF_FUNCTION_STORE_INT(StoreInterleaved3, vst3, _, HWY_STORE_INT) +#undef HWY_NEON_BUILD_PARAM_HWY_STORE_INT + +#define HWY_NEON_BUILD_PARAM_HWY_STORE_INT(type, size) \ + Tuple4 tup, type##_t *to +HWY_NEON_DEF_FUNCTION_STORE_INT(StoreInterleaved4, vst4, _, HWY_STORE_INT) +#undef HWY_NEON_BUILD_PARAM_HWY_STORE_INT + +#undef HWY_NEON_DEF_FUNCTION_STORE_INT +#undef HWY_NEON_BUILD_TPL_HWY_STORE_INT +#undef HWY_NEON_BUILD_RET_HWY_STORE_INT +#undef HWY_NEON_BUILD_ARG_HWY_STORE_INT +} // namespace detail + +template +HWY_API void StoreInterleaved2(const Vec128 v0, const Vec128 v1, + Simd /*tag*/, + T* HWY_RESTRICT unaligned) { + detail::Tuple2 tup = {{{v0.raw, v1.raw}}}; + detail::StoreInterleaved2(tup, unaligned); +} + +// <= 32 bits: avoid writing more than N bytes by copying to buffer +template +HWY_API void StoreInterleaved2(const Vec128 v0, const Vec128 v1, + Simd /*tag*/, + T* HWY_RESTRICT unaligned) { + alignas(16) T buf[2 * 8 / sizeof(T)]; + detail::Tuple2 tup = {{{v0.raw, v1.raw}}}; + detail::StoreInterleaved2(tup, buf); + CopyBytes(buf, unaligned); +} + +#if HWY_ARCH_ARM_V7 +// 64x2: split into two 64x1 +template +HWY_API void StoreInterleaved2(const Vec128 v0, const Vec128 v1, + Full128 d, T* HWY_RESTRICT unaligned) { + const Half dh; + StoreInterleaved2(LowerHalf(dh, v0), LowerHalf(dh, v1), dh, unaligned); + StoreInterleaved2(UpperHalf(dh, v0), UpperHalf(dh, v1), dh, unaligned + 2); +} +#endif // HWY_ARCH_ARM_V7 + +// ------------------------------ StoreInterleaved3 + +template +HWY_API void StoreInterleaved3(const Vec128 v0, const Vec128 v1, + const Vec128 v2, Simd /*tag*/, + T* HWY_RESTRICT unaligned) { + detail::Tuple3 tup = {{{v0.raw, v1.raw, v2.raw}}}; + detail::StoreInterleaved3(tup, unaligned); +} + +// <= 32 bits: avoid writing more than N bytes by copying to buffer +template +HWY_API void StoreInterleaved3(const Vec128 v0, const Vec128 v1, + const Vec128 v2, Simd /*tag*/, + T* HWY_RESTRICT unaligned) { + alignas(16) T buf[3 * 8 / sizeof(T)]; + detail::Tuple3 tup = {{{v0.raw, v1.raw, v2.raw}}}; + detail::StoreInterleaved3(tup, buf); + CopyBytes(buf, unaligned); +} + +#if HWY_ARCH_ARM_V7 +// 64x2: split into two 64x1 +template +HWY_API void StoreInterleaved3(const Vec128 v0, const Vec128 v1, + const Vec128 v2, Full128 d, + T* HWY_RESTRICT unaligned) { + const Half dh; + StoreInterleaved3(LowerHalf(dh, v0), LowerHalf(dh, v1), LowerHalf(dh, v2), dh, + unaligned); + StoreInterleaved3(UpperHalf(dh, v0), UpperHalf(dh, v1), UpperHalf(dh, v2), dh, + unaligned + 3); +} +#endif // HWY_ARCH_ARM_V7 + +// 
------------------------------ StoreInterleaved4 + +template +HWY_API void StoreInterleaved4(const Vec128 v0, const Vec128 v1, + const Vec128 v2, const Vec128 v3, + Simd /*tag*/, + T* HWY_RESTRICT unaligned) { + detail::Tuple4 tup = {{{v0.raw, v1.raw, v2.raw, v3.raw}}}; + detail::StoreInterleaved4(tup, unaligned); +} + +// <= 32 bits: avoid writing more than N bytes by copying to buffer +template +HWY_API void StoreInterleaved4(const Vec128 v0, const Vec128 v1, + const Vec128 v2, const Vec128 v3, + Simd /*tag*/, + T* HWY_RESTRICT unaligned) { + alignas(16) T buf[4 * 8 / sizeof(T)]; + detail::Tuple4 tup = {{{v0.raw, v1.raw, v2.raw, v3.raw}}}; + detail::StoreInterleaved4(tup, buf); + CopyBytes(buf, unaligned); +} + +#if HWY_ARCH_ARM_V7 +// 64x2: split into two 64x1 +template +HWY_API void StoreInterleaved4(const Vec128 v0, const Vec128 v1, + const Vec128 v2, const Vec128 v3, + Full128 d, T* HWY_RESTRICT unaligned) { + const Half dh; + StoreInterleaved4(LowerHalf(dh, v0), LowerHalf(dh, v1), LowerHalf(dh, v2), + LowerHalf(dh, v3), dh, unaligned); + StoreInterleaved4(UpperHalf(dh, v0), UpperHalf(dh, v1), UpperHalf(dh, v2), + UpperHalf(dh, v3), dh, unaligned + 4); +} +#endif // HWY_ARCH_ARM_V7 + +#undef HWY_IF_STORE_INT + +// ------------------------------ Lt128 + +template +HWY_INLINE Mask128 Lt128(Simd d, Vec128 a, + Vec128 b) { + static_assert(!IsSigned() && sizeof(T) == 8, "T must be u64"); + // Truth table of Eq and Lt for Hi and Lo u64. + // (removed lines with (=H && cH) or (=L && cL) - cannot both be true) + // =H =L cH cL | out = cH | (=H & cL) + // 0 0 0 0 | 0 + // 0 0 0 1 | 0 + // 0 0 1 0 | 1 + // 0 0 1 1 | 1 + // 0 1 0 0 | 0 + // 0 1 0 1 | 0 + // 0 1 1 0 | 1 + // 1 0 0 0 | 0 + // 1 0 0 1 | 1 + // 1 1 0 0 | 0 + const Mask128 eqHL = Eq(a, b); + const Vec128 ltHL = VecFromMask(d, Lt(a, b)); + // We need to bring cL to the upper lane/bit corresponding to cH. Comparing + // the result of InterleaveUpper/Lower requires 9 ops, whereas shifting the + // comparison result leftwards requires only 4. IfThenElse compiles to the + // same code as OrAnd(). + const Vec128 ltLx = DupEven(ltHL); + const Vec128 outHx = IfThenElse(eqHL, ltLx, ltHL); + return MaskFromVec(DupOdd(outHx)); +} + +template +HWY_INLINE Mask128 Lt128Upper(Simd d, Vec128 a, + Vec128 b) { + const Vec128 ltHL = VecFromMask(d, Lt(a, b)); + return MaskFromVec(InterleaveUpper(d, ltHL, ltHL)); +} + +// ------------------------------ Eq128 + +template +HWY_INLINE Mask128 Eq128(Simd d, Vec128 a, + Vec128 b) { + static_assert(!IsSigned() && sizeof(T) == 8, "T must be u64"); + const Vec128 eqHL = VecFromMask(d, Eq(a, b)); + return MaskFromVec(And(Reverse2(d, eqHL), eqHL)); +} + +template +HWY_INLINE Mask128 Eq128Upper(Simd d, Vec128 a, + Vec128 b) { + const Vec128 eqHL = VecFromMask(d, Eq(a, b)); + return MaskFromVec(InterleaveUpper(d, eqHL, eqHL)); +} + +// ------------------------------ Ne128 + +template +HWY_INLINE Mask128 Ne128(Simd d, Vec128 a, + Vec128 b) { + static_assert(!IsSigned() && sizeof(T) == 8, "T must be u64"); + const Vec128 neHL = VecFromMask(d, Ne(a, b)); + return MaskFromVec(Or(Reverse2(d, neHL), neHL)); +} + +template +HWY_INLINE Mask128 Ne128Upper(Simd d, Vec128 a, + Vec128 b) { + const Vec128 neHL = VecFromMask(d, Ne(a, b)); + return MaskFromVec(InterleaveUpper(d, neHL, neHL)); +} + +// ------------------------------ Min128, Max128 (Lt128) + +// Without a native OddEven, it seems infeasible to go faster than Lt128. 
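+// Usage sketch (illustrative values): a 128-bit key spans two u64 lanes, with
+// its high half in the upper lane:
+//   const Full128<uint64_t> d;
+//   alignas(16) const uint64_t a[2] = {5u, 1u};  // lo = 5, hi = 1
+//   alignas(16) const uint64_t b[2] = {4u, 2u};  // lo = 4, hi = 2
+//   const auto min = Min128(d, Load(d, a), Load(d, b));  // == a
+// a wins because its high half is smaller, even though its low half is larger.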
+template +HWY_INLINE VFromD Min128(D d, const VFromD a, const VFromD b) { + return IfThenElse(Lt128(d, a, b), a, b); +} + +template +HWY_INLINE VFromD Max128(D d, const VFromD a, const VFromD b) { + return IfThenElse(Lt128(d, b, a), a, b); +} + +template +HWY_INLINE VFromD Min128Upper(D d, const VFromD a, const VFromD b) { + return IfThenElse(Lt128Upper(d, a, b), a, b); +} + +template +HWY_INLINE VFromD Max128Upper(D d, const VFromD a, const VFromD b) { + return IfThenElse(Lt128Upper(d, b, a), a, b); +} + +namespace detail { // for code folding +#if HWY_ARCH_ARM_V7 +#undef vuzp1_s8 +#undef vuzp1_u8 +#undef vuzp1_s16 +#undef vuzp1_u16 +#undef vuzp1_s32 +#undef vuzp1_u32 +#undef vuzp1_f32 +#undef vuzp1q_s8 +#undef vuzp1q_u8 +#undef vuzp1q_s16 +#undef vuzp1q_u16 +#undef vuzp1q_s32 +#undef vuzp1q_u32 +#undef vuzp1q_f32 +#undef vuzp2_s8 +#undef vuzp2_u8 +#undef vuzp2_s16 +#undef vuzp2_u16 +#undef vuzp2_s32 +#undef vuzp2_u32 +#undef vuzp2_f32 +#undef vuzp2q_s8 +#undef vuzp2q_u8 +#undef vuzp2q_s16 +#undef vuzp2q_u16 +#undef vuzp2q_s32 +#undef vuzp2q_u32 +#undef vuzp2q_f32 +#undef vzip1_s8 +#undef vzip1_u8 +#undef vzip1_s16 +#undef vzip1_u16 +#undef vzip1_s32 +#undef vzip1_u32 +#undef vzip1_f32 +#undef vzip1q_s8 +#undef vzip1q_u8 +#undef vzip1q_s16 +#undef vzip1q_u16 +#undef vzip1q_s32 +#undef vzip1q_u32 +#undef vzip1q_f32 +#undef vzip2_s8 +#undef vzip2_u8 +#undef vzip2_s16 +#undef vzip2_u16 +#undef vzip2_s32 +#undef vzip2_u32 +#undef vzip2_f32 +#undef vzip2q_s8 +#undef vzip2q_u8 +#undef vzip2q_s16 +#undef vzip2q_u16 +#undef vzip2q_s32 +#undef vzip2q_u32 +#undef vzip2q_f32 +#endif + +#undef HWY_NEON_BUILD_ARG_1 +#undef HWY_NEON_BUILD_ARG_2 +#undef HWY_NEON_BUILD_ARG_3 +#undef HWY_NEON_BUILD_PARAM_1 +#undef HWY_NEON_BUILD_PARAM_2 +#undef HWY_NEON_BUILD_PARAM_3 +#undef HWY_NEON_BUILD_RET_1 +#undef HWY_NEON_BUILD_RET_2 +#undef HWY_NEON_BUILD_RET_3 +#undef HWY_NEON_BUILD_TPL_1 +#undef HWY_NEON_BUILD_TPL_2 +#undef HWY_NEON_BUILD_TPL_3 +#undef HWY_NEON_DEF_FUNCTION +#undef HWY_NEON_DEF_FUNCTION_ALL_FLOATS +#undef HWY_NEON_DEF_FUNCTION_ALL_TYPES +#undef HWY_NEON_DEF_FUNCTION_FLOAT_64 +#undef HWY_NEON_DEF_FUNCTION_INTS +#undef HWY_NEON_DEF_FUNCTION_INTS_UINTS +#undef HWY_NEON_DEF_FUNCTION_INT_16 +#undef HWY_NEON_DEF_FUNCTION_INT_32 +#undef HWY_NEON_DEF_FUNCTION_INT_8 +#undef HWY_NEON_DEF_FUNCTION_INT_8_16_32 +#undef HWY_NEON_DEF_FUNCTION_TPL +#undef HWY_NEON_DEF_FUNCTION_UIF81632 +#undef HWY_NEON_DEF_FUNCTION_UINTS +#undef HWY_NEON_DEF_FUNCTION_UINT_16 +#undef HWY_NEON_DEF_FUNCTION_UINT_32 +#undef HWY_NEON_DEF_FUNCTION_UINT_8 +#undef HWY_NEON_DEF_FUNCTION_UINT_8_16_32 +#undef HWY_NEON_EVAL +} // namespace detail + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); diff --git a/hwy/ops/arm_sve-inl.h b/hwy/ops/arm_sve-inl.h new file mode 100644 index 0000000..1ccac9e --- /dev/null +++ b/hwy/ops/arm_sve-inl.h @@ -0,0 +1,3151 @@ +// Copyright 2021 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// ARM SVE[2] vectors (length not known at compile time).
+// External include guard in highway.h - see comment there.
+
+#include <arm_sve.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#include "hwy/base.h"
+#include "hwy/ops/shared-inl.h"
+
+// If running on hardware whose vector length is known to be a power of two, we
+// can skip fixups for non-power of two sizes.
+#undef HWY_SVE_IS_POW2
+#if HWY_TARGET == HWY_SVE_256 || HWY_TARGET == HWY_SVE2_128
+#define HWY_SVE_IS_POW2 1
+#else
+#define HWY_SVE_IS_POW2 0
+#endif
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+template <class V>
+struct DFromV_t {};  // specialized in macros
+template <class V>
+using DFromV = typename DFromV_t<RemoveConst<V>>::type;
+
+template <class V>
+using TFromV = TFromD<DFromV<V>>;
+
+// ================================================== MACROS
+
+// Generate specializations and function definitions using X macros. Although
+// harder to read and debug, writing everything manually is too bulky.
+
+namespace detail {  // for code folding
+
+// Unsigned:
+#define HWY_SVE_FOREACH_U08(X_MACRO, NAME, OP) X_MACRO(uint, u, 8, 8, NAME, OP)
+#define HWY_SVE_FOREACH_U16(X_MACRO, NAME, OP) X_MACRO(uint, u, 16, 8, NAME, OP)
+#define HWY_SVE_FOREACH_U32(X_MACRO, NAME, OP) \
+  X_MACRO(uint, u, 32, 16, NAME, OP)
+#define HWY_SVE_FOREACH_U64(X_MACRO, NAME, OP) \
+  X_MACRO(uint, u, 64, 32, NAME, OP)
+
+// Signed:
+#define HWY_SVE_FOREACH_I08(X_MACRO, NAME, OP) X_MACRO(int, s, 8, 8, NAME, OP)
+#define HWY_SVE_FOREACH_I16(X_MACRO, NAME, OP) X_MACRO(int, s, 16, 8, NAME, OP)
+#define HWY_SVE_FOREACH_I32(X_MACRO, NAME, OP) X_MACRO(int, s, 32, 16, NAME, OP)
+#define HWY_SVE_FOREACH_I64(X_MACRO, NAME, OP) X_MACRO(int, s, 64, 32, NAME, OP)
+
+// Float:
+#define HWY_SVE_FOREACH_F16(X_MACRO, NAME, OP) \
+  X_MACRO(float, f, 16, 16, NAME, OP)
+#define HWY_SVE_FOREACH_F32(X_MACRO, NAME, OP) \
+  X_MACRO(float, f, 32, 16, NAME, OP)
+#define HWY_SVE_FOREACH_F64(X_MACRO, NAME, OP) \
+  X_MACRO(float, f, 64, 32, NAME, OP)
+
+// For all element sizes:
+#define HWY_SVE_FOREACH_U(X_MACRO, NAME, OP) \
+  HWY_SVE_FOREACH_U08(X_MACRO, NAME, OP)     \
+  HWY_SVE_FOREACH_U16(X_MACRO, NAME, OP)     \
+  HWY_SVE_FOREACH_U32(X_MACRO, NAME, OP)     \
+  HWY_SVE_FOREACH_U64(X_MACRO, NAME, OP)
+
+#define HWY_SVE_FOREACH_I(X_MACRO, NAME, OP) \
+  HWY_SVE_FOREACH_I08(X_MACRO, NAME, OP)     \
+  HWY_SVE_FOREACH_I16(X_MACRO, NAME, OP)     \
+  HWY_SVE_FOREACH_I32(X_MACRO, NAME, OP)     \
+  HWY_SVE_FOREACH_I64(X_MACRO, NAME, OP)
+
+#define HWY_SVE_FOREACH_F(X_MACRO, NAME, OP) \
+  HWY_SVE_FOREACH_F16(X_MACRO, NAME, OP)     \
+  HWY_SVE_FOREACH_F32(X_MACRO, NAME, OP)     \
+  HWY_SVE_FOREACH_F64(X_MACRO, NAME, OP)
+
+// Commonly used type categories for a given element size:
+#define HWY_SVE_FOREACH_UI08(X_MACRO, NAME, OP) \
+  HWY_SVE_FOREACH_U08(X_MACRO, NAME, OP)        \
+  HWY_SVE_FOREACH_I08(X_MACRO, NAME, OP)
+
+#define HWY_SVE_FOREACH_UI16(X_MACRO, NAME, OP) \
+  HWY_SVE_FOREACH_U16(X_MACRO, NAME, OP)        \
+  HWY_SVE_FOREACH_I16(X_MACRO, NAME, OP)
+
+#define HWY_SVE_FOREACH_UI32(X_MACRO, NAME, OP) \
+  HWY_SVE_FOREACH_U32(X_MACRO, NAME, OP)        \
+  HWY_SVE_FOREACH_I32(X_MACRO, NAME, OP)
+
+#define HWY_SVE_FOREACH_UI64(X_MACRO, NAME, OP) \
+  HWY_SVE_FOREACH_U64(X_MACRO, NAME, OP)        \
+  HWY_SVE_FOREACH_I64(X_MACRO, NAME, OP)
+
+#define HWY_SVE_FOREACH_UIF3264(X_MACRO, NAME, OP) \
+  HWY_SVE_FOREACH_UI32(X_MACRO, NAME, OP)          \
+  HWY_SVE_FOREACH_UI64(X_MACRO, NAME, OP)          \
+  HWY_SVE_FOREACH_F32(X_MACRO, NAME, OP)           \
+  HWY_SVE_FOREACH_F64(X_MACRO, NAME, OP)
+
+// Commonly used type categories:
+#define HWY_SVE_FOREACH_UI(X_MACRO, NAME, OP) \
+  HWY_SVE_FOREACH_U(X_MACRO, NAME, OP)        \
+  HWY_SVE_FOREACH_I(X_MACRO, NAME, OP)
+
+#define HWY_SVE_FOREACH_IF(X_MACRO, NAME, OP) \
+  HWY_SVE_FOREACH_I(X_MACRO, NAME, OP)        \
+  HWY_SVE_FOREACH_F(X_MACRO, NAME, OP)
+
+#define HWY_SVE_FOREACH(X_MACRO, NAME, OP) \
+  HWY_SVE_FOREACH_U(X_MACRO, NAME, OP)     \
+  HWY_SVE_FOREACH_I(X_MACRO, NAME, OP)     \
+  HWY_SVE_FOREACH_F(X_MACRO, NAME, OP)
+
+// Assemble types for use in x-macros
+#define HWY_SVE_T(BASE, BITS) BASE##BITS##_t
+#define HWY_SVE_D(BASE, BITS, N, POW2) Simd<HWY_SVE_T(BASE, BITS), N, POW2>
+#define HWY_SVE_V(BASE, BITS) sv##BASE##BITS##_t
+
+}  // namespace detail
+
+#define HWY_SPECIALIZE(BASE, CHAR, BITS, HALF, NAME, OP) \
+  template <>                                            \
+  struct DFromV_t<HWY_SVE_V(BASE, BITS)> {               \
+    using type = ScalableTag<HWY_SVE_T(BASE, BITS)>;     \
+  };
+
+HWY_SVE_FOREACH(HWY_SPECIALIZE, _, _)
+#undef HWY_SPECIALIZE
+
+// Note: _x (don't-care value for inactive lanes) avoids additional MOVPRFX
+// instructions, and we anyway only use it when the predicate is ptrue.
+
+// vector = f(vector), e.g. Not
+#define HWY_SVE_RETV_ARGPV(BASE, CHAR, BITS, HALF, NAME, OP)    \
+  HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_V(BASE, BITS) v) { \
+    return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), v);   \
+  }
+#define HWY_SVE_RETV_ARGV(BASE, CHAR, BITS, HALF, NAME, OP)     \
+  HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_V(BASE, BITS) v) { \
+    return sv##OP##_##CHAR##BITS(v);                            \
+  }
+
+// vector = f(vector, scalar), e.g. detail::AddN
+#define HWY_SVE_RETV_ARGPVN(BASE, CHAR, BITS, HALF, NAME, OP)    \
+  HWY_API HWY_SVE_V(BASE, BITS)                                  \
+      NAME(HWY_SVE_V(BASE, BITS) a, HWY_SVE_T(BASE, BITS) b) {   \
+    return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), a, b); \
+  }
+#define HWY_SVE_RETV_ARGVN(BASE, CHAR, BITS, HALF, NAME, OP)   \
+  HWY_API HWY_SVE_V(BASE, BITS)                                \
+      NAME(HWY_SVE_V(BASE, BITS) a, HWY_SVE_T(BASE, BITS) b) { \
+    return sv##OP##_##CHAR##BITS(a, b);                        \
+  }
+
+// vector = f(vector, vector), e.g. Add
+#define HWY_SVE_RETV_ARGPVV(BASE, CHAR, BITS, HALF, NAME, OP)    \
+  HWY_API HWY_SVE_V(BASE, BITS)                                  \
+      NAME(HWY_SVE_V(BASE, BITS) a, HWY_SVE_V(BASE, BITS) b) {   \
+    return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), a, b); \
+  }
+#define HWY_SVE_RETV_ARGVV(BASE, CHAR, BITS, HALF, NAME, OP)   \
+  HWY_API HWY_SVE_V(BASE, BITS)                                \
+      NAME(HWY_SVE_V(BASE, BITS) a, HWY_SVE_V(BASE, BITS) b) { \
+    return sv##OP##_##CHAR##BITS(a, b);                        \
+  }
+
+// ------------------------------ Lanes
+
+namespace detail {
+
+// Returns actual lanes of a hardware vector without rounding to a power of two.
+HWY_INLINE size_t AllHardwareLanes(hwy::SizeTag<1> /* tag */) {
+  return svcntb_pat(SV_ALL);
+}
+HWY_INLINE size_t AllHardwareLanes(hwy::SizeTag<2> /* tag */) {
+  return svcnth_pat(SV_ALL);
+}
+HWY_INLINE size_t AllHardwareLanes(hwy::SizeTag<4> /* tag */) {
+  return svcntw_pat(SV_ALL);
+}
+HWY_INLINE size_t AllHardwareLanes(hwy::SizeTag<8> /* tag */) {
+  return svcntd_pat(SV_ALL);
+}
+
+// All-true mask from a macro
+#define HWY_SVE_ALL_PTRUE(BITS) svptrue_pat_b##BITS(SV_ALL)
+
+#if HWY_SVE_IS_POW2
+#define HWY_SVE_PTRUE(BITS) HWY_SVE_ALL_PTRUE(BITS)
+#else
+#define HWY_SVE_PTRUE(BITS) svptrue_pat_b##BITS(SV_POW2)
+
+// Returns actual lanes of a hardware vector, rounded down to a power of two.
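+// For example (hypothetical hardware): on a 384-bit SVE implementation,
+// svcntb_pat(SV_POW2) returns 32 rather than 48, so Lanes() and the
+// HWY_SVE_PTRUE predicates agree on a power-of-two vector length.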
+template +HWY_INLINE size_t HardwareLanes() { + return svcntb_pat(SV_POW2); +} +template +HWY_INLINE size_t HardwareLanes() { + return svcnth_pat(SV_POW2); +} +template +HWY_INLINE size_t HardwareLanes() { + return svcntw_pat(SV_POW2); +} +template +HWY_INLINE size_t HardwareLanes() { + return svcntd_pat(SV_POW2); +} + +#endif // HWY_SVE_IS_POW2 + +} // namespace detail + +// Returns actual number of lanes after capping by N and shifting. May return 0 +// (e.g. for "1/8th" of a u32x4 - would be 1 for 1/8th of u32x8). +#if HWY_TARGET == HWY_SVE_256 +template +HWY_API constexpr size_t Lanes(Simd /* d */) { + return HWY_MIN(detail::ScaleByPower(32 / sizeof(T), kPow2), N); +} +#elif HWY_TARGET == HWY_SVE2_128 +template +HWY_API constexpr size_t Lanes(Simd /* d */) { + return HWY_MIN(detail::ScaleByPower(16 / sizeof(T), kPow2), N); +} +#else +template +HWY_API size_t Lanes(Simd d) { + const size_t actual = detail::HardwareLanes(); + // Common case of full vectors: avoid any extra instructions. + if (detail::IsFull(d)) return actual; + return HWY_MIN(detail::ScaleByPower(actual, kPow2), N); +} +#endif // HWY_TARGET + +// ================================================== MASK INIT + +// One mask bit per byte; only the one belonging to the lowest byte is valid. + +// ------------------------------ FirstN +#define HWY_SVE_FIRSTN(BASE, CHAR, BITS, HALF, NAME, OP) \ + template \ + HWY_API svbool_t NAME(HWY_SVE_D(BASE, BITS, N, kPow2) d, size_t count) { \ + const size_t limit = detail::IsFull(d) ? count : HWY_MIN(Lanes(d), count); \ + return sv##OP##_b##BITS##_u32(uint32_t{0}, static_cast(limit)); \ + } +HWY_SVE_FOREACH(HWY_SVE_FIRSTN, FirstN, whilelt) +#undef HWY_SVE_FIRSTN + +template +using MFromD = decltype(FirstN(D(), 0)); + +namespace detail { + +#define HWY_SVE_WRAP_PTRUE(BASE, CHAR, BITS, HALF, NAME, OP) \ + template \ + HWY_API svbool_t NAME(HWY_SVE_D(BASE, BITS, N, kPow2) /* d */) { \ + return HWY_SVE_PTRUE(BITS); \ + } \ + template \ + HWY_API svbool_t All##NAME(HWY_SVE_D(BASE, BITS, N, kPow2) /* d */) { \ + return HWY_SVE_ALL_PTRUE(BITS); \ + } + +HWY_SVE_FOREACH(HWY_SVE_WRAP_PTRUE, PTrue, ptrue) // return all-true +#undef HWY_SVE_WRAP_PTRUE + +HWY_API svbool_t PFalse() { return svpfalse_b(); } + +// Returns all-true if d is HWY_FULL or FirstN(N) after capping N. +// +// This is used in functions that load/store memory; other functions (e.g. +// arithmetic) can ignore d and use PTrue instead. +template +svbool_t MakeMask(D d) { + return IsFull(d) ? PTrue(d) : FirstN(d, Lanes(d)); +} + +} // namespace detail + +// ================================================== INIT + +// ------------------------------ Set +// vector = f(d, scalar), e.g. Set +#define HWY_SVE_SET(BASE, CHAR, BITS, HALF, NAME, OP) \ + template \ + HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_D(BASE, BITS, N, kPow2) /* d */, \ + HWY_SVE_T(BASE, BITS) arg) { \ + return sv##OP##_##CHAR##BITS(arg); \ + } + +HWY_SVE_FOREACH(HWY_SVE_SET, Set, dup_n) +#undef HWY_SVE_SET + +// Required for Zero and VFromD +template +svuint16_t Set(Simd d, bfloat16_t arg) { + return Set(RebindToUnsigned(), arg.bits); +} + +template +using VFromD = decltype(Set(D(), TFromD())); + +// ------------------------------ Zero + +template +VFromD Zero(D d) { + // Cast to support bfloat16_t. 
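+  // (Per the Set overload above, bfloat16_t vectors are carried as
+  // svuint16_t, so zeros are written via the unsigned rebind and cast back.)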
+ const RebindToUnsigned du; + return BitCast(d, Set(du, 0)); +} + +// ------------------------------ Undefined + +#define HWY_SVE_UNDEFINED(BASE, CHAR, BITS, HALF, NAME, OP) \ + template \ + HWY_API HWY_SVE_V(BASE, BITS) \ + NAME(HWY_SVE_D(BASE, BITS, N, kPow2) /* d */) { \ + return sv##OP##_##CHAR##BITS(); \ + } + +HWY_SVE_FOREACH(HWY_SVE_UNDEFINED, Undefined, undef) + +// ------------------------------ BitCast + +namespace detail { + +// u8: no change +#define HWY_SVE_CAST_NOP(BASE, CHAR, BITS, HALF, NAME, OP) \ + HWY_API HWY_SVE_V(BASE, BITS) BitCastToByte(HWY_SVE_V(BASE, BITS) v) { \ + return v; \ + } \ + template \ + HWY_API HWY_SVE_V(BASE, BITS) BitCastFromByte( \ + HWY_SVE_D(BASE, BITS, N, kPow2) /* d */, HWY_SVE_V(BASE, BITS) v) { \ + return v; \ + } + +// All other types +#define HWY_SVE_CAST(BASE, CHAR, BITS, HALF, NAME, OP) \ + HWY_INLINE svuint8_t BitCastToByte(HWY_SVE_V(BASE, BITS) v) { \ + return sv##OP##_u8_##CHAR##BITS(v); \ + } \ + template \ + HWY_INLINE HWY_SVE_V(BASE, BITS) \ + BitCastFromByte(HWY_SVE_D(BASE, BITS, N, kPow2) /* d */, svuint8_t v) { \ + return sv##OP##_##CHAR##BITS##_u8(v); \ + } + +HWY_SVE_FOREACH_U08(HWY_SVE_CAST_NOP, _, _) +HWY_SVE_FOREACH_I08(HWY_SVE_CAST, _, reinterpret) +HWY_SVE_FOREACH_UI16(HWY_SVE_CAST, _, reinterpret) +HWY_SVE_FOREACH_UI32(HWY_SVE_CAST, _, reinterpret) +HWY_SVE_FOREACH_UI64(HWY_SVE_CAST, _, reinterpret) +HWY_SVE_FOREACH_F(HWY_SVE_CAST, _, reinterpret) + +#undef HWY_SVE_CAST_NOP +#undef HWY_SVE_CAST + +template +HWY_INLINE svuint16_t BitCastFromByte(Simd /* d */, + svuint8_t v) { + return BitCastFromByte(Simd(), v); +} + +} // namespace detail + +template +HWY_API VFromD BitCast(D d, FromV v) { + return detail::BitCastFromByte(d, detail::BitCastToByte(v)); +} + +// ================================================== LOGICAL + +// detail::*N() functions accept a scalar argument to avoid extra Set(). 
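+// For example, detail::AddN(v, 1) can use an add-with-immediate form, whereas
+// Add(v, Set(d, 1)) may first materialize the constant into a vector register.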
+ +// ------------------------------ Not +HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGPV, Not, not ) // NOLINT + +// ------------------------------ And + +namespace detail { +HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGPVN, AndN, and_n) +} // namespace detail + +HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGPVV, And, and) + +template +HWY_API V And(const V a, const V b) { + const DFromV df; + const RebindToUnsigned du; + return BitCast(df, And(BitCast(du, a), BitCast(du, b))); +} + +// ------------------------------ Or + +HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGPVV, Or, orr) + +template +HWY_API V Or(const V a, const V b) { + const DFromV df; + const RebindToUnsigned du; + return BitCast(df, Or(BitCast(du, a), BitCast(du, b))); +} + +// ------------------------------ Xor + +namespace detail { +HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGPVN, XorN, eor_n) +} // namespace detail + +HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGPVV, Xor, eor) + +template +HWY_API V Xor(const V a, const V b) { + const DFromV df; + const RebindToUnsigned du; + return BitCast(df, Xor(BitCast(du, a), BitCast(du, b))); +} + +// ------------------------------ AndNot + +namespace detail { +#define HWY_SVE_RETV_ARGPVN_SWAP(BASE, CHAR, BITS, HALF, NAME, OP) \ + HWY_API HWY_SVE_V(BASE, BITS) \ + NAME(HWY_SVE_T(BASE, BITS) a, HWY_SVE_V(BASE, BITS) b) { \ + return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), b, a); \ + } + +HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGPVN_SWAP, AndNotN, bic_n) +#undef HWY_SVE_RETV_ARGPVN_SWAP +} // namespace detail + +#define HWY_SVE_RETV_ARGPVV_SWAP(BASE, CHAR, BITS, HALF, NAME, OP) \ + HWY_API HWY_SVE_V(BASE, BITS) \ + NAME(HWY_SVE_V(BASE, BITS) a, HWY_SVE_V(BASE, BITS) b) { \ + return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), b, a); \ + } +HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGPVV_SWAP, AndNot, bic) +#undef HWY_SVE_RETV_ARGPVV_SWAP + +template +HWY_API V AndNot(const V a, const V b) { + const DFromV df; + const RebindToUnsigned du; + return BitCast(df, AndNot(BitCast(du, a), BitCast(du, b))); +} + +// ------------------------------ Or3 +template +HWY_API V Or3(V o1, V o2, V o3) { + return Or(o1, Or(o2, o3)); +} + +// ------------------------------ OrAnd +template +HWY_API V OrAnd(const V o, const V a1, const V a2) { + return Or(o, And(a1, a2)); +} + +// ------------------------------ PopulationCount + +#ifdef HWY_NATIVE_POPCNT +#undef HWY_NATIVE_POPCNT +#else +#define HWY_NATIVE_POPCNT +#endif + +// Need to return original type instead of unsigned. 
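+// (svcnt_* returns the unsigned type of the same width, e.g. svuint8_t for
+// svint8_t input, hence the BitCast back to the argument's type below.)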
+#define HWY_SVE_POPCNT(BASE, CHAR, BITS, HALF, NAME, OP) \ + HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_V(BASE, BITS) v) { \ + return BitCast(DFromV(), \ + sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), v)); \ + } +HWY_SVE_FOREACH_UI(HWY_SVE_POPCNT, PopulationCount, cnt) +#undef HWY_SVE_POPCNT + +// ================================================== SIGN + +// ------------------------------ Neg +HWY_SVE_FOREACH_IF(HWY_SVE_RETV_ARGPV, Neg, neg) + +// ------------------------------ Abs +HWY_SVE_FOREACH_IF(HWY_SVE_RETV_ARGPV, Abs, abs) + +// ------------------------------ CopySign[ToAbs] + +template +HWY_API V CopySign(const V magn, const V sign) { + const auto msb = SignBit(DFromV()); + return Or(AndNot(msb, magn), And(msb, sign)); +} + +template +HWY_API V CopySignToAbs(const V abs, const V sign) { + const auto msb = SignBit(DFromV()); + return Or(abs, And(msb, sign)); +} + +// ================================================== ARITHMETIC + +// ------------------------------ Add + +namespace detail { +HWY_SVE_FOREACH(HWY_SVE_RETV_ARGPVN, AddN, add_n) +} // namespace detail + +HWY_SVE_FOREACH(HWY_SVE_RETV_ARGPVV, Add, add) + +// ------------------------------ Sub + +namespace detail { +// Can't use HWY_SVE_RETV_ARGPVN because caller wants to specify pg. +#define HWY_SVE_RETV_ARGPVN_MASK(BASE, CHAR, BITS, HALF, NAME, OP) \ + HWY_API HWY_SVE_V(BASE, BITS) \ + NAME(svbool_t pg, HWY_SVE_V(BASE, BITS) a, HWY_SVE_T(BASE, BITS) b) { \ + return sv##OP##_##CHAR##BITS##_z(pg, a, b); \ + } + +HWY_SVE_FOREACH(HWY_SVE_RETV_ARGPVN_MASK, SubN, sub_n) +#undef HWY_SVE_RETV_ARGPVN_MASK +} // namespace detail + +HWY_SVE_FOREACH(HWY_SVE_RETV_ARGPVV, Sub, sub) + +// ------------------------------ SumsOf8 +HWY_API svuint64_t SumsOf8(const svuint8_t v) { + const ScalableTag du32; + const ScalableTag du64; + const svbool_t pg = detail::PTrue(du64); + + const svuint32_t sums_of_4 = svdot_n_u32(Zero(du32), v, 1); + // Compute pairwise sum of u32 and extend to u64. + // TODO(janwas): on SVE2, we can instead use svaddp. 
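+  // Viewed as u64, each lane of sums_of_4 holds two u32 partial sums.
+  // Shifting right by 32 isolates the upper one; zero-extending the low 32
+  // bits (svextw) isolates the other, so a single 64-bit Add yields their
+  // total.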
+ const svuint64_t hi = svlsr_n_u64_x(pg, BitCast(du64, sums_of_4), 32); + // Isolate the lower 32 bits (to be added to the upper 32 and zero-extended) + const svuint64_t lo = svextw_u64_x(pg, BitCast(du64, sums_of_4)); + return Add(hi, lo); +} + +// ------------------------------ SaturatedAdd + +HWY_SVE_FOREACH_UI08(HWY_SVE_RETV_ARGVV, SaturatedAdd, qadd) +HWY_SVE_FOREACH_UI16(HWY_SVE_RETV_ARGVV, SaturatedAdd, qadd) + +// ------------------------------ SaturatedSub + +HWY_SVE_FOREACH_UI08(HWY_SVE_RETV_ARGVV, SaturatedSub, qsub) +HWY_SVE_FOREACH_UI16(HWY_SVE_RETV_ARGVV, SaturatedSub, qsub) + +// ------------------------------ AbsDiff +HWY_SVE_FOREACH_IF(HWY_SVE_RETV_ARGPVV, AbsDiff, abd) + +// ------------------------------ ShiftLeft[Same] + +#define HWY_SVE_SHIFT_N(BASE, CHAR, BITS, HALF, NAME, OP) \ + template \ + HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_V(BASE, BITS) v) { \ + return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), v, kBits); \ + } \ + HWY_API HWY_SVE_V(BASE, BITS) \ + NAME##Same(HWY_SVE_V(BASE, BITS) v, HWY_SVE_T(uint, BITS) bits) { \ + return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), v, bits); \ + } + +HWY_SVE_FOREACH_UI(HWY_SVE_SHIFT_N, ShiftLeft, lsl_n) + +// ------------------------------ ShiftRight[Same] + +HWY_SVE_FOREACH_U(HWY_SVE_SHIFT_N, ShiftRight, lsr_n) +HWY_SVE_FOREACH_I(HWY_SVE_SHIFT_N, ShiftRight, asr_n) + +#undef HWY_SVE_SHIFT_N + +// ------------------------------ RotateRight + +// TODO(janwas): svxar on SVE2 +template +HWY_API V RotateRight(const V v) { + constexpr size_t kSizeInBits = sizeof(TFromV) * 8; + static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift count"); + if (kBits == 0) return v; + return Or(ShiftRight(v), ShiftLeft(v)); +} + +// ------------------------------ Shl/r + +#define HWY_SVE_SHIFT(BASE, CHAR, BITS, HALF, NAME, OP) \ + HWY_API HWY_SVE_V(BASE, BITS) \ + NAME(HWY_SVE_V(BASE, BITS) v, HWY_SVE_V(BASE, BITS) bits) { \ + const RebindToUnsigned> du; \ + return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), v, \ + BitCast(du, bits)); \ + } + +HWY_SVE_FOREACH_UI(HWY_SVE_SHIFT, Shl, lsl) + +HWY_SVE_FOREACH_U(HWY_SVE_SHIFT, Shr, lsr) +HWY_SVE_FOREACH_I(HWY_SVE_SHIFT, Shr, asr) + +#undef HWY_SVE_SHIFT + +// ------------------------------ Min/Max + +HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGPVV, Min, min) +HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGPVV, Max, max) +HWY_SVE_FOREACH_F(HWY_SVE_RETV_ARGPVV, Min, minnm) +HWY_SVE_FOREACH_F(HWY_SVE_RETV_ARGPVV, Max, maxnm) + +namespace detail { +HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGPVN, MinN, min_n) +HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGPVN, MaxN, max_n) +} // namespace detail + +// ------------------------------ Mul +HWY_SVE_FOREACH_UI16(HWY_SVE_RETV_ARGPVV, Mul, mul) +HWY_SVE_FOREACH_UIF3264(HWY_SVE_RETV_ARGPVV, Mul, mul) + +// Per-target flag to prevent generic_ops-inl.h from defining i64 operator*. 
+#ifdef HWY_NATIVE_I64MULLO +#undef HWY_NATIVE_I64MULLO +#else +#define HWY_NATIVE_I64MULLO +#endif + +// ------------------------------ MulHigh +HWY_SVE_FOREACH_UI16(HWY_SVE_RETV_ARGPVV, MulHigh, mulh) +// Not part of API, used internally: +HWY_SVE_FOREACH_UI32(HWY_SVE_RETV_ARGPVV, MulHigh, mulh) +HWY_SVE_FOREACH_U64(HWY_SVE_RETV_ARGPVV, MulHigh, mulh) + +// ------------------------------ MulFixedPoint15 +HWY_API svint16_t MulFixedPoint15(svint16_t a, svint16_t b) { +#if HWY_TARGET == HWY_SVE2 + return svqrdmulh_s16(a, b); +#else + const DFromV d; + const RebindToUnsigned du; + + const svuint16_t lo = BitCast(du, Mul(a, b)); + const svint16_t hi = MulHigh(a, b); + // We want (lo + 0x4000) >> 15, but that can overflow, and if it does we must + // carry that into the result. Instead isolate the top two bits because only + // they can influence the result. + const svuint16_t lo_top2 = ShiftRight<14>(lo); + // Bits 11: add 2, 10: add 1, 01: add 1, 00: add 0. + const svuint16_t rounding = ShiftRight<1>(detail::AddN(lo_top2, 1)); + return Add(Add(hi, hi), BitCast(d, rounding)); +#endif +} + +// ------------------------------ Div +HWY_SVE_FOREACH_F(HWY_SVE_RETV_ARGPVV, Div, div) + +// ------------------------------ ApproximateReciprocal +HWY_SVE_FOREACH_F32(HWY_SVE_RETV_ARGV, ApproximateReciprocal, recpe) + +// ------------------------------ Sqrt +HWY_SVE_FOREACH_F(HWY_SVE_RETV_ARGPV, Sqrt, sqrt) + +// ------------------------------ ApproximateReciprocalSqrt +HWY_SVE_FOREACH_F32(HWY_SVE_RETV_ARGV, ApproximateReciprocalSqrt, rsqrte) + +// ------------------------------ MulAdd +#define HWY_SVE_FMA(BASE, CHAR, BITS, HALF, NAME, OP) \ + HWY_API HWY_SVE_V(BASE, BITS) \ + NAME(HWY_SVE_V(BASE, BITS) mul, HWY_SVE_V(BASE, BITS) x, \ + HWY_SVE_V(BASE, BITS) add) { \ + return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), x, mul, add); \ + } + +HWY_SVE_FOREACH_F(HWY_SVE_FMA, MulAdd, mad) + +// ------------------------------ NegMulAdd +HWY_SVE_FOREACH_F(HWY_SVE_FMA, NegMulAdd, msb) + +// ------------------------------ MulSub +HWY_SVE_FOREACH_F(HWY_SVE_FMA, MulSub, nmsb) + +// ------------------------------ NegMulSub +HWY_SVE_FOREACH_F(HWY_SVE_FMA, NegMulSub, nmad) + +#undef HWY_SVE_FMA + +// ------------------------------ Round etc. + +HWY_SVE_FOREACH_F(HWY_SVE_RETV_ARGPV, Round, rintn) +HWY_SVE_FOREACH_F(HWY_SVE_RETV_ARGPV, Floor, rintm) +HWY_SVE_FOREACH_F(HWY_SVE_RETV_ARGPV, Ceil, rintp) +HWY_SVE_FOREACH_F(HWY_SVE_RETV_ARGPV, Trunc, rintz) + +// ================================================== MASK + +// ------------------------------ RebindMask +template +HWY_API svbool_t RebindMask(const D /*d*/, const MFrom mask) { + return mask; +} + +// ------------------------------ Mask logical + +HWY_API svbool_t Not(svbool_t m) { + // We don't know the lane type, so assume 8-bit. For larger types, this will + // de-canonicalize the predicate, i.e. set bits to 1 even though they do not + // correspond to the lowest byte in the lane. Per ARM, such bits are ignored. + return svnot_b_z(HWY_SVE_PTRUE(8), m); +} +HWY_API svbool_t And(svbool_t a, svbool_t b) { + return svand_b_z(b, b, a); // same order as AndNot for consistency +} +HWY_API svbool_t AndNot(svbool_t a, svbool_t b) { + return svbic_b_z(b, b, a); // reversed order like NEON +} +HWY_API svbool_t Or(svbool_t a, svbool_t b) { + return svsel_b(a, a, b); // a ? true : b +} +HWY_API svbool_t Xor(svbool_t a, svbool_t b) { + return svsel_b(a, svnand_b_z(a, a, b), b); // a ? !(a & b) : b. 
+} + +HWY_API svbool_t ExclusiveNeither(svbool_t a, svbool_t b) { + return svnor_b_z(HWY_SVE_PTRUE(8), a, b); // !a && !b, undefined if a && b. +} + +// ------------------------------ CountTrue + +#define HWY_SVE_COUNT_TRUE(BASE, CHAR, BITS, HALF, NAME, OP) \ + template \ + HWY_API size_t NAME(HWY_SVE_D(BASE, BITS, N, kPow2) d, svbool_t m) { \ + return sv##OP##_b##BITS(detail::MakeMask(d), m); \ + } + +HWY_SVE_FOREACH(HWY_SVE_COUNT_TRUE, CountTrue, cntp) +#undef HWY_SVE_COUNT_TRUE + +// For 16-bit Compress: full vector, not limited to SV_POW2. +namespace detail { + +#define HWY_SVE_COUNT_TRUE_FULL(BASE, CHAR, BITS, HALF, NAME, OP) \ + template \ + HWY_API size_t NAME(HWY_SVE_D(BASE, BITS, N, kPow2) /* d */, svbool_t m) { \ + return sv##OP##_b##BITS(svptrue_b##BITS(), m); \ + } + +HWY_SVE_FOREACH(HWY_SVE_COUNT_TRUE_FULL, CountTrueFull, cntp) +#undef HWY_SVE_COUNT_TRUE_FULL + +} // namespace detail + +// ------------------------------ AllFalse +template +HWY_API bool AllFalse(D d, svbool_t m) { + return !svptest_any(detail::MakeMask(d), m); +} + +// ------------------------------ AllTrue +template +HWY_API bool AllTrue(D d, svbool_t m) { + return CountTrue(d, m) == Lanes(d); +} + +// ------------------------------ FindFirstTrue +template +HWY_API intptr_t FindFirstTrue(D d, svbool_t m) { + return AllFalse(d, m) ? intptr_t{-1} + : static_cast( + CountTrue(d, svbrkb_b_z(detail::MakeMask(d), m))); +} + +// ------------------------------ FindKnownFirstTrue +template +HWY_API size_t FindKnownFirstTrue(D d, svbool_t m) { + return CountTrue(d, svbrkb_b_z(detail::MakeMask(d), m)); +} + +// ------------------------------ IfThenElse +#define HWY_SVE_IF_THEN_ELSE(BASE, CHAR, BITS, HALF, NAME, OP) \ + HWY_API HWY_SVE_V(BASE, BITS) \ + NAME(svbool_t m, HWY_SVE_V(BASE, BITS) yes, HWY_SVE_V(BASE, BITS) no) { \ + return sv##OP##_##CHAR##BITS(m, yes, no); \ + } + +HWY_SVE_FOREACH(HWY_SVE_IF_THEN_ELSE, IfThenElse, sel) +#undef HWY_SVE_IF_THEN_ELSE + +// ------------------------------ IfThenElseZero +template +HWY_API V IfThenElseZero(const svbool_t mask, const V yes) { + return IfThenElse(mask, yes, Zero(DFromV())); +} + +// ------------------------------ IfThenZeroElse +template +HWY_API V IfThenZeroElse(const svbool_t mask, const V no) { + return IfThenElse(mask, Zero(DFromV()), no); +} + +// ================================================== COMPARE + +// mask = f(vector, vector) +#define HWY_SVE_COMPARE(BASE, CHAR, BITS, HALF, NAME, OP) \ + HWY_API svbool_t NAME(HWY_SVE_V(BASE, BITS) a, HWY_SVE_V(BASE, BITS) b) { \ + return sv##OP##_##CHAR##BITS(HWY_SVE_PTRUE(BITS), a, b); \ + } +#define HWY_SVE_COMPARE_N(BASE, CHAR, BITS, HALF, NAME, OP) \ + HWY_API svbool_t NAME(HWY_SVE_V(BASE, BITS) a, HWY_SVE_T(BASE, BITS) b) { \ + return sv##OP##_##CHAR##BITS(HWY_SVE_PTRUE(BITS), a, b); \ + } + +// ------------------------------ Eq +HWY_SVE_FOREACH(HWY_SVE_COMPARE, Eq, cmpeq) +namespace detail { +HWY_SVE_FOREACH(HWY_SVE_COMPARE_N, EqN, cmpeq_n) +} // namespace detail + +// ------------------------------ Ne +HWY_SVE_FOREACH(HWY_SVE_COMPARE, Ne, cmpne) +namespace detail { +HWY_SVE_FOREACH(HWY_SVE_COMPARE_N, NeN, cmpne_n) +} // namespace detail + +// ------------------------------ Lt +HWY_SVE_FOREACH(HWY_SVE_COMPARE, Lt, cmplt) +namespace detail { +HWY_SVE_FOREACH(HWY_SVE_COMPARE_N, LtN, cmplt_n) +} // namespace detail + +// ------------------------------ Le +HWY_SVE_FOREACH_F(HWY_SVE_COMPARE, Le, cmple) + +#undef HWY_SVE_COMPARE +#undef HWY_SVE_COMPARE_N + +// ------------------------------ Gt/Ge (swapped order) 
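+// SVE does provide svcmpgt/svcmpge; reusing Lt/Le with swapped operands
+// avoids instantiating two more macro families.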
+template +HWY_API svbool_t Gt(const V a, const V b) { + return Lt(b, a); +} +template +HWY_API svbool_t Ge(const V a, const V b) { + return Le(b, a); +} + +// ------------------------------ TestBit +template +HWY_API svbool_t TestBit(const V a, const V bit) { + return detail::NeN(And(a, bit), 0); +} + +// ------------------------------ MaskFromVec (Ne) +template +HWY_API svbool_t MaskFromVec(const V v) { + return detail::NeN(v, static_cast>(0)); +} + +// ------------------------------ VecFromMask +template +HWY_API VFromD VecFromMask(const D d, svbool_t mask) { + const RebindToSigned di; + // This generates MOV imm, whereas svdup_n_s8_z generates MOV scalar, which + // requires an extra instruction plus M0 pipeline. + return BitCast(d, IfThenElseZero(mask, Set(di, -1))); +} + +// ------------------------------ IfVecThenElse (MaskFromVec, IfThenElse) + +#if HWY_TARGET == HWY_SVE2 + +#define HWY_SVE_IF_VEC(BASE, CHAR, BITS, HALF, NAME, OP) \ + HWY_API HWY_SVE_V(BASE, BITS) \ + NAME(HWY_SVE_V(BASE, BITS) mask, HWY_SVE_V(BASE, BITS) yes, \ + HWY_SVE_V(BASE, BITS) no) { \ + return sv##OP##_##CHAR##BITS(yes, no, mask); \ + } + +HWY_SVE_FOREACH_UI(HWY_SVE_IF_VEC, IfVecThenElse, bsl) +#undef HWY_SVE_IF_VEC + +template +HWY_API V IfVecThenElse(const V mask, const V yes, const V no) { + const DFromV d; + const RebindToUnsigned du; + return BitCast( + d, IfVecThenElse(BitCast(du, mask), BitCast(du, yes), BitCast(du, no))); +} + +#else + +template +HWY_API V IfVecThenElse(const V mask, const V yes, const V no) { + return Or(And(mask, yes), AndNot(mask, no)); +} + +#endif // HWY_TARGET == HWY_SVE2 + +// ------------------------------ Floating-point classification (Ne) + +template +HWY_API svbool_t IsNaN(const V v) { + return Ne(v, v); // could also use cmpuo +} + +template +HWY_API svbool_t IsInf(const V v) { + using T = TFromV; + const DFromV d; + const RebindToSigned di; + const VFromD vi = BitCast(di, v); + // 'Shift left' to clear the sign bit, check for exponent=max and mantissa=0. + return RebindMask(d, detail::EqN(Add(vi, vi), hwy::MaxExponentTimes2())); +} + +// Returns whether normal/subnormal/zero. +template +HWY_API svbool_t IsFinite(const V v) { + using T = TFromV; + const DFromV d; + const RebindToUnsigned du; + const RebindToSigned di; // cheaper than unsigned comparison + const VFromD vu = BitCast(du, v); + // 'Shift left' to clear the sign bit, then right so we can compare with the + // max exponent (cannot compare with MaxExponentTimes2 directly because it is + // negative and non-negative floats would be greater). 
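+  // e.g. for f32: Add(vu, vu) moves the biased exponent into bits [31:24];
+  // shifting right by MantissaBits<float>() + 1 = 24 then leaves just the
+  // exponent, which equals the max exponent field only for Inf/NaN.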
+ const VFromD exp = + BitCast(di, ShiftRight() + 1>(Add(vu, vu))); + return RebindMask(d, detail::LtN(exp, hwy::MaxExponentField())); +} + +// ================================================== MEMORY + +// ------------------------------ Load/MaskedLoad/LoadDup128/Store/Stream + +#define HWY_SVE_LOAD(BASE, CHAR, BITS, HALF, NAME, OP) \ + template \ + HWY_API HWY_SVE_V(BASE, BITS) \ + NAME(HWY_SVE_D(BASE, BITS, N, kPow2) d, \ + const HWY_SVE_T(BASE, BITS) * HWY_RESTRICT p) { \ + return sv##OP##_##CHAR##BITS(detail::MakeMask(d), p); \ + } + +#define HWY_SVE_MASKED_LOAD(BASE, CHAR, BITS, HALF, NAME, OP) \ + template \ + HWY_API HWY_SVE_V(BASE, BITS) \ + NAME(svbool_t m, HWY_SVE_D(BASE, BITS, N, kPow2) /* d */, \ + const HWY_SVE_T(BASE, BITS) * HWY_RESTRICT p) { \ + return sv##OP##_##CHAR##BITS(m, p); \ + } + +#define HWY_SVE_LOAD_DUP128(BASE, CHAR, BITS, HALF, NAME, OP) \ + template \ + HWY_API HWY_SVE_V(BASE, BITS) \ + NAME(HWY_SVE_D(BASE, BITS, N, kPow2) /* d */, \ + const HWY_SVE_T(BASE, BITS) * HWY_RESTRICT p) { \ + /* All-true predicate to load all 128 bits. */ \ + return sv##OP##_##CHAR##BITS(HWY_SVE_PTRUE(8), p); \ + } + +#define HWY_SVE_STORE(BASE, CHAR, BITS, HALF, NAME, OP) \ + template \ + HWY_API void NAME(HWY_SVE_V(BASE, BITS) v, \ + HWY_SVE_D(BASE, BITS, N, kPow2) d, \ + HWY_SVE_T(BASE, BITS) * HWY_RESTRICT p) { \ + sv##OP##_##CHAR##BITS(detail::MakeMask(d), p, v); \ + } + +#define HWY_SVE_BLENDED_STORE(BASE, CHAR, BITS, HALF, NAME, OP) \ + template \ + HWY_API void NAME(HWY_SVE_V(BASE, BITS) v, svbool_t m, \ + HWY_SVE_D(BASE, BITS, N, kPow2) /* d */, \ + HWY_SVE_T(BASE, BITS) * HWY_RESTRICT p) { \ + sv##OP##_##CHAR##BITS(m, p, v); \ + } + +HWY_SVE_FOREACH(HWY_SVE_LOAD, Load, ld1) +HWY_SVE_FOREACH(HWY_SVE_MASKED_LOAD, MaskedLoad, ld1) +HWY_SVE_FOREACH(HWY_SVE_LOAD_DUP128, LoadDup128, ld1rq) +HWY_SVE_FOREACH(HWY_SVE_STORE, Store, st1) +HWY_SVE_FOREACH(HWY_SVE_STORE, Stream, stnt1) +HWY_SVE_FOREACH(HWY_SVE_BLENDED_STORE, BlendedStore, st1) + +#undef HWY_SVE_LOAD +#undef HWY_SVE_MASKED_LOAD +#undef HWY_SVE_LOAD_DUP128 +#undef HWY_SVE_STORE +#undef HWY_SVE_BLENDED_STORE + +// BF16 is the same as svuint16_t because BF16 is optional before v8.6. +template +HWY_API svuint16_t Load(Simd d, + const bfloat16_t* HWY_RESTRICT p) { + return Load(RebindToUnsigned(), + reinterpret_cast(p)); +} + +template +HWY_API void Store(svuint16_t v, Simd d, + bfloat16_t* HWY_RESTRICT p) { + Store(v, RebindToUnsigned(), + reinterpret_cast(p)); +} + +// ------------------------------ Load/StoreU + +// SVE only requires lane alignment, not natural alignment of the entire +// vector. 
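+// Hence LoadU/StoreU simply forward to the (lane-aligned) Load/Store.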
+template +HWY_API VFromD LoadU(D d, const TFromD* HWY_RESTRICT p) { + return Load(d, p); +} + +template +HWY_API void StoreU(const V v, D d, TFromD* HWY_RESTRICT p) { + Store(v, d, p); +} + +// ------------------------------ ScatterOffset/Index + +#define HWY_SVE_SCATTER_OFFSET(BASE, CHAR, BITS, HALF, NAME, OP) \ + template \ + HWY_API void NAME(HWY_SVE_V(BASE, BITS) v, \ + HWY_SVE_D(BASE, BITS, N, kPow2) d, \ + HWY_SVE_T(BASE, BITS) * HWY_RESTRICT base, \ + HWY_SVE_V(int, BITS) offset) { \ + sv##OP##_s##BITS##offset_##CHAR##BITS(detail::MakeMask(d), base, offset, \ + v); \ + } + +#define HWY_SVE_SCATTER_INDEX(BASE, CHAR, BITS, HALF, NAME, OP) \ + template \ + HWY_API void NAME( \ + HWY_SVE_V(BASE, BITS) v, HWY_SVE_D(BASE, BITS, N, kPow2) d, \ + HWY_SVE_T(BASE, BITS) * HWY_RESTRICT base, HWY_SVE_V(int, BITS) index) { \ + sv##OP##_s##BITS##index_##CHAR##BITS(detail::MakeMask(d), base, index, v); \ + } + +HWY_SVE_FOREACH_UIF3264(HWY_SVE_SCATTER_OFFSET, ScatterOffset, st1_scatter) +HWY_SVE_FOREACH_UIF3264(HWY_SVE_SCATTER_INDEX, ScatterIndex, st1_scatter) +#undef HWY_SVE_SCATTER_OFFSET +#undef HWY_SVE_SCATTER_INDEX + +// ------------------------------ GatherOffset/Index + +#define HWY_SVE_GATHER_OFFSET(BASE, CHAR, BITS, HALF, NAME, OP) \ + template \ + HWY_API HWY_SVE_V(BASE, BITS) \ + NAME(HWY_SVE_D(BASE, BITS, N, kPow2) d, \ + const HWY_SVE_T(BASE, BITS) * HWY_RESTRICT base, \ + HWY_SVE_V(int, BITS) offset) { \ + return sv##OP##_s##BITS##offset_##CHAR##BITS(detail::MakeMask(d), base, \ + offset); \ + } +#define HWY_SVE_GATHER_INDEX(BASE, CHAR, BITS, HALF, NAME, OP) \ + template \ + HWY_API HWY_SVE_V(BASE, BITS) \ + NAME(HWY_SVE_D(BASE, BITS, N, kPow2) d, \ + const HWY_SVE_T(BASE, BITS) * HWY_RESTRICT base, \ + HWY_SVE_V(int, BITS) index) { \ + return sv##OP##_s##BITS##index_##CHAR##BITS(detail::MakeMask(d), base, \ + index); \ + } + +HWY_SVE_FOREACH_UIF3264(HWY_SVE_GATHER_OFFSET, GatherOffset, ld1_gather) +HWY_SVE_FOREACH_UIF3264(HWY_SVE_GATHER_INDEX, GatherIndex, ld1_gather) +#undef HWY_SVE_GATHER_OFFSET +#undef HWY_SVE_GATHER_INDEX + +// ------------------------------ LoadInterleaved2 + +// Per-target flag to prevent generic_ops-inl.h from defining LoadInterleaved2. 
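+// (Defining it here tells generic_ops-inl.h not to emit its own version.)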
+#ifdef HWY_NATIVE_LOAD_STORE_INTERLEAVED +#undef HWY_NATIVE_LOAD_STORE_INTERLEAVED +#else +#define HWY_NATIVE_LOAD_STORE_INTERLEAVED +#endif + +#define HWY_SVE_LOAD2(BASE, CHAR, BITS, HALF, NAME, OP) \ + template \ + HWY_API void NAME(HWY_SVE_D(BASE, BITS, N, kPow2) d, \ + const HWY_SVE_T(BASE, BITS) * HWY_RESTRICT unaligned, \ + HWY_SVE_V(BASE, BITS) & v0, HWY_SVE_V(BASE, BITS) & v1) { \ + const sv##BASE##BITS##x2_t tuple = \ + sv##OP##_##CHAR##BITS(detail::MakeMask(d), unaligned); \ + v0 = svget2(tuple, 0); \ + v1 = svget2(tuple, 1); \ + } +HWY_SVE_FOREACH(HWY_SVE_LOAD2, LoadInterleaved2, ld2) + +#undef HWY_SVE_LOAD2 + +// ------------------------------ LoadInterleaved3 + +#define HWY_SVE_LOAD3(BASE, CHAR, BITS, HALF, NAME, OP) \ + template \ + HWY_API void NAME(HWY_SVE_D(BASE, BITS, N, kPow2) d, \ + const HWY_SVE_T(BASE, BITS) * HWY_RESTRICT unaligned, \ + HWY_SVE_V(BASE, BITS) & v0, HWY_SVE_V(BASE, BITS) & v1, \ + HWY_SVE_V(BASE, BITS) & v2) { \ + const sv##BASE##BITS##x3_t tuple = \ + sv##OP##_##CHAR##BITS(detail::MakeMask(d), unaligned); \ + v0 = svget3(tuple, 0); \ + v1 = svget3(tuple, 1); \ + v2 = svget3(tuple, 2); \ + } +HWY_SVE_FOREACH(HWY_SVE_LOAD3, LoadInterleaved3, ld3) + +#undef HWY_SVE_LOAD3 + +// ------------------------------ LoadInterleaved4 + +#define HWY_SVE_LOAD4(BASE, CHAR, BITS, HALF, NAME, OP) \ + template \ + HWY_API void NAME(HWY_SVE_D(BASE, BITS, N, kPow2) d, \ + const HWY_SVE_T(BASE, BITS) * HWY_RESTRICT unaligned, \ + HWY_SVE_V(BASE, BITS) & v0, HWY_SVE_V(BASE, BITS) & v1, \ + HWY_SVE_V(BASE, BITS) & v2, HWY_SVE_V(BASE, BITS) & v3) { \ + const sv##BASE##BITS##x4_t tuple = \ + sv##OP##_##CHAR##BITS(detail::MakeMask(d), unaligned); \ + v0 = svget4(tuple, 0); \ + v1 = svget4(tuple, 1); \ + v2 = svget4(tuple, 2); \ + v3 = svget4(tuple, 3); \ + } +HWY_SVE_FOREACH(HWY_SVE_LOAD4, LoadInterleaved4, ld4) + +#undef HWY_SVE_LOAD4 + +// ------------------------------ StoreInterleaved2 + +#define HWY_SVE_STORE2(BASE, CHAR, BITS, HALF, NAME, OP) \ + template \ + HWY_API void NAME(HWY_SVE_V(BASE, BITS) v0, HWY_SVE_V(BASE, BITS) v1, \ + HWY_SVE_D(BASE, BITS, N, kPow2) d, \ + HWY_SVE_T(BASE, BITS) * HWY_RESTRICT unaligned) { \ + const sv##BASE##BITS##x2_t tuple = svcreate2##_##CHAR##BITS(v0, v1); \ + sv##OP##_##CHAR##BITS(detail::MakeMask(d), unaligned, tuple); \ + } +HWY_SVE_FOREACH(HWY_SVE_STORE2, StoreInterleaved2, st2) + +#undef HWY_SVE_STORE2 + +// ------------------------------ StoreInterleaved3 + +#define HWY_SVE_STORE3(BASE, CHAR, BITS, HALF, NAME, OP) \ + template \ + HWY_API void NAME(HWY_SVE_V(BASE, BITS) v0, HWY_SVE_V(BASE, BITS) v1, \ + HWY_SVE_V(BASE, BITS) v2, \ + HWY_SVE_D(BASE, BITS, N, kPow2) d, \ + HWY_SVE_T(BASE, BITS) * HWY_RESTRICT unaligned) { \ + const sv##BASE##BITS##x3_t triple = svcreate3##_##CHAR##BITS(v0, v1, v2); \ + sv##OP##_##CHAR##BITS(detail::MakeMask(d), unaligned, triple); \ + } +HWY_SVE_FOREACH(HWY_SVE_STORE3, StoreInterleaved3, st3) + +#undef HWY_SVE_STORE3 + +// ------------------------------ StoreInterleaved4 + +#define HWY_SVE_STORE4(BASE, CHAR, BITS, HALF, NAME, OP) \ + template \ + HWY_API void NAME(HWY_SVE_V(BASE, BITS) v0, HWY_SVE_V(BASE, BITS) v1, \ + HWY_SVE_V(BASE, BITS) v2, HWY_SVE_V(BASE, BITS) v3, \ + HWY_SVE_D(BASE, BITS, N, kPow2) d, \ + HWY_SVE_T(BASE, BITS) * HWY_RESTRICT unaligned) { \ + const sv##BASE##BITS##x4_t quad = \ + svcreate4##_##CHAR##BITS(v0, v1, v2, v3); \ + sv##OP##_##CHAR##BITS(detail::MakeMask(d), unaligned, quad); \ + } +HWY_SVE_FOREACH(HWY_SVE_STORE4, StoreInterleaved4, st4) + +#undef HWY_SVE_STORE4 + +// 
================================================== CONVERT + +// ------------------------------ PromoteTo + +// Same sign +#define HWY_SVE_PROMOTE_TO(BASE, CHAR, BITS, HALF, NAME, OP) \ + template \ + HWY_API HWY_SVE_V(BASE, BITS) NAME( \ + HWY_SVE_D(BASE, BITS, N, kPow2) /* tag */, HWY_SVE_V(BASE, HALF) v) { \ + return sv##OP##_##CHAR##BITS(v); \ + } + +HWY_SVE_FOREACH_UI16(HWY_SVE_PROMOTE_TO, PromoteTo, unpklo) +HWY_SVE_FOREACH_UI32(HWY_SVE_PROMOTE_TO, PromoteTo, unpklo) +HWY_SVE_FOREACH_UI64(HWY_SVE_PROMOTE_TO, PromoteTo, unpklo) + +// 2x +template +HWY_API svuint32_t PromoteTo(Simd dto, svuint8_t vfrom) { + const RepartitionToWide> d2; + return PromoteTo(dto, PromoteTo(d2, vfrom)); +} +template +HWY_API svint32_t PromoteTo(Simd dto, svint8_t vfrom) { + const RepartitionToWide> d2; + return PromoteTo(dto, PromoteTo(d2, vfrom)); +} + +// Sign change +template +HWY_API svint16_t PromoteTo(Simd dto, svuint8_t vfrom) { + const RebindToUnsigned du; + return BitCast(dto, PromoteTo(du, vfrom)); +} +template +HWY_API svint32_t PromoteTo(Simd dto, svuint16_t vfrom) { + const RebindToUnsigned du; + return BitCast(dto, PromoteTo(du, vfrom)); +} +template +HWY_API svint32_t PromoteTo(Simd dto, svuint8_t vfrom) { + const Repartition> du16; + const Repartition di16; + return PromoteTo(dto, BitCast(di16, PromoteTo(du16, vfrom))); +} + +// ------------------------------ PromoteTo F + +// Unlike Highway's ZipLower, this returns the same type. +namespace detail { +HWY_SVE_FOREACH(HWY_SVE_RETV_ARGVV, ZipLowerSame, zip1) +} // namespace detail + +template +HWY_API svfloat32_t PromoteTo(Simd /* d */, + const svfloat16_t v) { + // svcvt* expects inputs in even lanes, whereas Highway wants lower lanes, so + // first replicate each lane once. + const svfloat16_t vv = detail::ZipLowerSame(v, v); + return svcvt_f32_f16_x(detail::PTrue(Simd()), vv); +} + +template +HWY_API svfloat64_t PromoteTo(Simd /* d */, + const svfloat32_t v) { + const svfloat32_t vv = detail::ZipLowerSame(v, v); + return svcvt_f64_f32_x(detail::PTrue(Simd()), vv); +} + +template +HWY_API svfloat64_t PromoteTo(Simd /* d */, + const svint32_t v) { + const svint32_t vv = detail::ZipLowerSame(v, v); + return svcvt_f64_s32_x(detail::PTrue(Simd()), vv); +} + +// For 16-bit Compress +namespace detail { +HWY_SVE_FOREACH_UI32(HWY_SVE_PROMOTE_TO, PromoteUpperTo, unpkhi) +#undef HWY_SVE_PROMOTE_TO + +template +HWY_API svfloat32_t PromoteUpperTo(Simd df, svfloat16_t v) { + const RebindToUnsigned du; + const RepartitionToNarrow dn; + return BitCast(df, PromoteUpperTo(du, BitCast(dn, v))); +} + +} // namespace detail + +// ------------------------------ DemoteTo U + +namespace detail { + +// Saturates unsigned vectors to half/quarter-width TN. +template +VU SaturateU(VU v) { + return detail::MinN(v, static_cast>(LimitsMax())); +} + +// Saturates unsigned vectors to half/quarter-width TN. +template +VI SaturateI(VI v) { + return detail::MinN(detail::MaxN(v, LimitsMin()), LimitsMax()); +} + +} // namespace detail + +template +HWY_API svuint8_t DemoteTo(Simd dn, const svint16_t v) { + const DFromV di; + const RebindToUnsigned du; + using TN = TFromD; + // First clamp negative numbers to zero and cast to unsigned. + const svuint16_t clamped = BitCast(du, detail::MaxN(v, 0)); + // Saturate to unsigned-max and halve the width. 
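+  // After MinN with LimitsMax<uint8_t>(), the high byte of each u16 lane is
+  // zero, so the even-byte svuzp1 below packs the saturated values.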
+ const svuint8_t vn = BitCast(dn, detail::SaturateU(clamped)); + return svuzp1_u8(vn, vn); +} + +template +HWY_API svuint16_t DemoteTo(Simd dn, const svint32_t v) { + const DFromV di; + const RebindToUnsigned du; + using TN = TFromD; + // First clamp negative numbers to zero and cast to unsigned. + const svuint32_t clamped = BitCast(du, detail::MaxN(v, 0)); + // Saturate to unsigned-max and halve the width. + const svuint16_t vn = BitCast(dn, detail::SaturateU(clamped)); + return svuzp1_u16(vn, vn); +} + +template +HWY_API svuint8_t DemoteTo(Simd dn, const svint32_t v) { + const DFromV di; + const RebindToUnsigned du; + const RepartitionToNarrow d2; + using TN = TFromD; + // First clamp negative numbers to zero and cast to unsigned. + const svuint32_t clamped = BitCast(du, detail::MaxN(v, 0)); + // Saturate to unsigned-max and quarter the width. + const svuint16_t cast16 = BitCast(d2, detail::SaturateU(clamped)); + const svuint8_t x2 = BitCast(dn, svuzp1_u16(cast16, cast16)); + return svuzp1_u8(x2, x2); +} + +HWY_API svuint8_t U8FromU32(const svuint32_t v) { + const DFromV du32; + const RepartitionToNarrow du16; + const RepartitionToNarrow du8; + + const svuint16_t cast16 = BitCast(du16, v); + const svuint16_t x2 = svuzp1_u16(cast16, cast16); + const svuint8_t cast8 = BitCast(du8, x2); + return svuzp1_u8(cast8, cast8); +} + +// ------------------------------ Truncations + +template +HWY_API svuint8_t TruncateTo(Simd /* tag */, + const svuint64_t v) { + const DFromV d; + const svuint8_t v1 = BitCast(d, v); + const svuint8_t v2 = svuzp1_u8(v1, v1); + const svuint8_t v3 = svuzp1_u8(v2, v2); + return svuzp1_u8(v3, v3); +} + +template +HWY_API svuint16_t TruncateTo(Simd /* tag */, + const svuint64_t v) { + const DFromV d; + const svuint16_t v1 = BitCast(d, v); + const svuint16_t v2 = svuzp1_u16(v1, v1); + return svuzp1_u16(v2, v2); +} + +template +HWY_API svuint32_t TruncateTo(Simd /* tag */, + const svuint64_t v) { + const DFromV d; + const svuint32_t v1 = BitCast(d, v); + return svuzp1_u32(v1, v1); +} + +template +HWY_API svuint8_t TruncateTo(Simd /* tag */, + const svuint32_t v) { + const DFromV d; + const svuint8_t v1 = BitCast(d, v); + const svuint8_t v2 = svuzp1_u8(v1, v1); + return svuzp1_u8(v2, v2); +} + +template +HWY_API svuint16_t TruncateTo(Simd /* tag */, + const svuint32_t v) { + const DFromV d; + const svuint16_t v1 = BitCast(d, v); + return svuzp1_u16(v1, v1); +} + +template +HWY_API svuint8_t TruncateTo(Simd /* tag */, + const svuint16_t v) { + const DFromV d; + const svuint8_t v1 = BitCast(d, v); + return svuzp1_u8(v1, v1); +} + +// ------------------------------ DemoteTo I + +template +HWY_API svint8_t DemoteTo(Simd dn, const svint16_t v) { +#if HWY_TARGET == HWY_SVE2 + const svint8_t vn = BitCast(dn, svqxtnb_s16(v)); +#else + using TN = TFromD; + const svint8_t vn = BitCast(dn, detail::SaturateI(v)); +#endif + return svuzp1_s8(vn, vn); +} + +template +HWY_API svint16_t DemoteTo(Simd dn, const svint32_t v) { +#if HWY_TARGET == HWY_SVE2 + const svint16_t vn = BitCast(dn, svqxtnb_s32(v)); +#else + using TN = TFromD; + const svint16_t vn = BitCast(dn, detail::SaturateI(v)); +#endif + return svuzp1_s16(vn, vn); +} + +template +HWY_API svint8_t DemoteTo(Simd dn, const svint32_t v) { + const RepartitionToWide d2; +#if HWY_TARGET == HWY_SVE2 + const svint16_t cast16 = BitCast(d2, svqxtnb_s16(svqxtnb_s32(v))); +#else + using TN = TFromD; + const svint16_t cast16 = BitCast(d2, detail::SaturateI(v)); +#endif + const svint8_t v2 = BitCast(dn, svuzp1_s16(cast16, cast16)); + return 
BitCast(dn, svuzp1_s8(v2, v2)); +} + +// ------------------------------ ConcatEven/ConcatOdd + +// WARNING: the upper half of these needs fixing up (uzp1/uzp2 use the +// full vector length, not rounded down to a power of two as we require). +namespace detail { + +#define HWY_SVE_CONCAT_EVERY_SECOND(BASE, CHAR, BITS, HALF, NAME, OP) \ + HWY_INLINE HWY_SVE_V(BASE, BITS) \ + NAME(HWY_SVE_V(BASE, BITS) hi, HWY_SVE_V(BASE, BITS) lo) { \ + return sv##OP##_##CHAR##BITS(lo, hi); \ + } +HWY_SVE_FOREACH(HWY_SVE_CONCAT_EVERY_SECOND, ConcatEvenFull, uzp1) +HWY_SVE_FOREACH(HWY_SVE_CONCAT_EVERY_SECOND, ConcatOddFull, uzp2) +#if defined(__ARM_FEATURE_SVE_MATMUL_FP64) +HWY_SVE_FOREACH(HWY_SVE_CONCAT_EVERY_SECOND, ConcatEvenBlocks, uzp1q) +HWY_SVE_FOREACH(HWY_SVE_CONCAT_EVERY_SECOND, ConcatOddBlocks, uzp2q) +#endif +#undef HWY_SVE_CONCAT_EVERY_SECOND + +// Used to slide up / shift whole register left; mask indicates which range +// to take from lo, and the rest is filled from hi starting at its lowest. +#define HWY_SVE_SPLICE(BASE, CHAR, BITS, HALF, NAME, OP) \ + HWY_API HWY_SVE_V(BASE, BITS) NAME( \ + HWY_SVE_V(BASE, BITS) hi, HWY_SVE_V(BASE, BITS) lo, svbool_t mask) { \ + return sv##OP##_##CHAR##BITS(mask, lo, hi); \ + } +HWY_SVE_FOREACH(HWY_SVE_SPLICE, Splice, splice) +#undef HWY_SVE_SPLICE + +} // namespace detail + +template +HWY_API VFromD ConcatOdd(D d, VFromD hi, VFromD lo) { +#if HWY_SVE_IS_POW2 + (void)d; + return detail::ConcatOddFull(hi, lo); +#else + const VFromD hi_odd = detail::ConcatOddFull(hi, hi); + const VFromD lo_odd = detail::ConcatOddFull(lo, lo); + return detail::Splice(hi_odd, lo_odd, FirstN(d, Lanes(d) / 2)); +#endif +} + +template +HWY_API VFromD ConcatEven(D d, VFromD hi, VFromD lo) { +#if HWY_SVE_IS_POW2 + (void)d; + return detail::ConcatEvenFull(hi, lo); +#else + const VFromD hi_odd = detail::ConcatEvenFull(hi, hi); + const VFromD lo_odd = detail::ConcatEvenFull(lo, lo); + return detail::Splice(hi_odd, lo_odd, FirstN(d, Lanes(d) / 2)); +#endif +} + +// ------------------------------ DemoteTo F + +template +HWY_API svfloat16_t DemoteTo(Simd d, const svfloat32_t v) { + const svfloat16_t in_even = svcvt_f16_f32_x(detail::PTrue(d), v); + return detail::ConcatEvenFull(in_even, + in_even); // lower half +} + +template +HWY_API svuint16_t DemoteTo(Simd /* d */, svfloat32_t v) { + const svuint16_t in_even = BitCast(ScalableTag(), v); + return detail::ConcatOddFull(in_even, in_even); // lower half +} + +template +HWY_API svfloat32_t DemoteTo(Simd d, const svfloat64_t v) { + const svfloat32_t in_even = svcvt_f32_f64_x(detail::PTrue(d), v); + return detail::ConcatEvenFull(in_even, + in_even); // lower half +} + +template +HWY_API svint32_t DemoteTo(Simd d, const svfloat64_t v) { + const svint32_t in_even = svcvt_s32_f64_x(detail::PTrue(d), v); + return detail::ConcatEvenFull(in_even, + in_even); // lower half +} + +// ------------------------------ ConvertTo F + +#define HWY_SVE_CONVERT(BASE, CHAR, BITS, HALF, NAME, OP) \ + /* signed integers */ \ + template \ + HWY_API HWY_SVE_V(BASE, BITS) \ + NAME(HWY_SVE_D(BASE, BITS, N, kPow2) /* d */, HWY_SVE_V(int, BITS) v) { \ + return sv##OP##_##CHAR##BITS##_s##BITS##_x(HWY_SVE_PTRUE(BITS), v); \ + } \ + /* unsigned integers */ \ + template \ + HWY_API HWY_SVE_V(BASE, BITS) \ + NAME(HWY_SVE_D(BASE, BITS, N, kPow2) /* d */, HWY_SVE_V(uint, BITS) v) { \ + return sv##OP##_##CHAR##BITS##_u##BITS##_x(HWY_SVE_PTRUE(BITS), v); \ + } \ + /* Truncates (rounds toward zero). 
*/ \
+  template <size_t N, int kPow2>                                             \
+  HWY_API HWY_SVE_V(int, BITS)                                               \
+      NAME(HWY_SVE_D(int, BITS, N, kPow2) /* d */, HWY_SVE_V(BASE, BITS) v) {\
+    return sv##OP##_s##BITS##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), v);      \
+  }
+
+// API only requires f32 but we provide f64 for use by Iota.
+HWY_SVE_FOREACH_F(HWY_SVE_CONVERT, ConvertTo, cvt)
+#undef HWY_SVE_CONVERT
+
+// ------------------------------ NearestInt (Round, ConvertTo)
+template <class VF, class DI = RebindToSigned<DFromV<VF>>>
+HWY_API VFromD<DI> NearestInt(VF v) {
+  // No single instruction, round then truncate.
+  return ConvertTo(DI(), Round(v));
+}
+
+// ------------------------------ Iota (Add, ConvertTo)
+
+#define HWY_SVE_IOTA(BASE, CHAR, BITS, HALF, NAME, OP)                        \
+  template <size_t N, int kPow2>                                              \
+  HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_D(BASE, BITS, N, kPow2) /* d */, \
+                                     HWY_SVE_T(BASE, BITS) first) {           \
+    return sv##OP##_##CHAR##BITS(first, 1);                                   \
+  }
+
+HWY_SVE_FOREACH_UI(HWY_SVE_IOTA, Iota, index)
+#undef HWY_SVE_IOTA
+
+template <class D, HWY_IF_FLOAT_D(D)>
+HWY_API VFromD<D> Iota(const D d, TFromD<D> first) {
+  const RebindToSigned<D> di;
+  return detail::AddN(ConvertTo(d, Iota(di, 0)), first);
+}
+
+// ------------------------------ InterleaveLower
+
+template <class D, class V = VFromD<D>>
+HWY_API V InterleaveLower(D d, const V a, const V b) {
+  static_assert(IsSame<TFromD<D>, TFromV<V>>(), "D/V mismatch");
+#if HWY_TARGET == HWY_SVE2_128
+  (void)d;
+  return detail::ZipLowerSame(a, b);
+#else
+  // Move lower halves of blocks to lower half of vector.
+  const Repartition<uint64_t, D> d64;
+  const auto a64 = BitCast(d64, a);
+  const auto b64 = BitCast(d64, b);
+  const auto a_blocks = detail::ConcatEvenFull(a64, a64);  // lower half
+  const auto b_blocks = detail::ConcatEvenFull(b64, b64);
+  return detail::ZipLowerSame(BitCast(d, a_blocks), BitCast(d, b_blocks));
+#endif
+}
+
+template <class V>
+HWY_API V InterleaveLower(const V a, const V b) {
+  return InterleaveLower(DFromV<V>(), a, b);
+}
+
+// ------------------------------ InterleaveUpper
+
+// Only use zip2 if vectors are a power of two; otherwise getting the actual
+// "upper half" requires MaskUpperHalf.
+#if HWY_TARGET == HWY_SVE2_128
+namespace detail {
+// Unlike Highway's ZipUpper, this returns the same type.
+HWY_SVE_FOREACH(HWY_SVE_RETV_ARGVV, ZipUpperSame, zip2)
+}  // namespace detail
+#endif
+
+// Full vector: guaranteed to have at least one block
+template <class D, class V = VFromD<D>,
+          hwy::EnableIf<detail::IsFull(D())>* = nullptr>
+HWY_API V InterleaveUpper(D d, const V a, const V b) {
+#if HWY_TARGET == HWY_SVE2_128
+  (void)d;
+  return detail::ZipUpperSame(a, b);
+#else
+  // Move upper halves of blocks to lower half of vector.
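+  // (ConcatOddFull keeps the odd u64 lanes, i.e. the upper half of every
+  // 128-bit block, and packs them into the lower half of the vector.)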
+ const Repartition d64; + const auto a64 = BitCast(d64, a); + const auto b64 = BitCast(d64, b); + const auto a_blocks = detail::ConcatOddFull(a64, a64); // lower half + const auto b_blocks = detail::ConcatOddFull(b64, b64); + return detail::ZipLowerSame(BitCast(d, a_blocks), BitCast(d, b_blocks)); +#endif +} + +// Capped/fraction: need runtime check +template , + hwy::EnableIf* = nullptr> +HWY_API V InterleaveUpper(D d, const V a, const V b) { + // Less than one block: treat as capped + if (Lanes(d) * sizeof(TFromD) < 16) { + const Half d2; + return InterleaveLower(d, UpperHalf(d2, a), UpperHalf(d2, b)); + } + return InterleaveUpper(DFromV(), a, b); +} + +// ================================================== COMBINE + +namespace detail { + +#if HWY_TARGET == HWY_SVE_256 || HWY_IDE +template +svbool_t MaskLowerHalf(D d) { + switch (Lanes(d)) { + case 32: + return svptrue_pat_b8(SV_VL16); + case 16: + return svptrue_pat_b8(SV_VL8); + case 8: + return svptrue_pat_b8(SV_VL4); + case 4: + return svptrue_pat_b8(SV_VL2); + default: + return svptrue_pat_b8(SV_VL1); + } +} +template +svbool_t MaskLowerHalf(D d) { + switch (Lanes(d)) { + case 16: + return svptrue_pat_b16(SV_VL8); + case 8: + return svptrue_pat_b16(SV_VL4); + case 4: + return svptrue_pat_b16(SV_VL2); + default: + return svptrue_pat_b16(SV_VL1); + } +} +template +svbool_t MaskLowerHalf(D d) { + switch (Lanes(d)) { + case 8: + return svptrue_pat_b32(SV_VL4); + case 4: + return svptrue_pat_b32(SV_VL2); + default: + return svptrue_pat_b32(SV_VL1); + } +} +template +svbool_t MaskLowerHalf(D d) { + switch (Lanes(d)) { + case 4: + return svptrue_pat_b64(SV_VL2); + default: + return svptrue_pat_b64(SV_VL1); + } +} +#endif +#if HWY_TARGET == HWY_SVE2_128 || HWY_IDE +template +svbool_t MaskLowerHalf(D d) { + switch (Lanes(d)) { + case 16: + return svptrue_pat_b8(SV_VL8); + case 8: + return svptrue_pat_b8(SV_VL4); + case 4: + return svptrue_pat_b8(SV_VL2); + case 2: + case 1: + default: + return svptrue_pat_b8(SV_VL1); + } +} +template +svbool_t MaskLowerHalf(D d) { + switch (Lanes(d)) { + case 8: + return svptrue_pat_b16(SV_VL4); + case 4: + return svptrue_pat_b16(SV_VL2); + case 2: + case 1: + default: + return svptrue_pat_b16(SV_VL1); + } +} +template +svbool_t MaskLowerHalf(D d) { + return svptrue_pat_b32(Lanes(d) == 4 ? SV_VL2 : SV_VL1); +} +template +svbool_t MaskLowerHalf(D /*d*/) { + return svptrue_pat_b64(SV_VL1); +} +#endif // HWY_TARGET == HWY_SVE2_128 +#if HWY_TARGET != HWY_SVE_256 && HWY_TARGET != HWY_SVE2_128 +template +svbool_t MaskLowerHalf(D d) { + return FirstN(d, Lanes(d) / 2); +} +#endif + +template +svbool_t MaskUpperHalf(D d) { + // TODO(janwas): WHILEGE on pow2 SVE2 + if (HWY_SVE_IS_POW2 && IsFull(d)) { + return Not(MaskLowerHalf(d)); + } + + // For Splice to work as intended, make sure bits above Lanes(d) are zero. + return AndNot(MaskLowerHalf(d), detail::MakeMask(d)); +} + +// Right-shift vector pair by constexpr; can be used to slide down (=N) or up +// (=Lanes()-N). 
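+// For example, Ext<1>(v, v) rotates lanes downward by one: lane i receives
+// lane i + 1 and the top lane receives lane 0 (used by OddEven below).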
+#define HWY_SVE_EXT(BASE, CHAR, BITS, HALF, NAME, OP) \ + template \ + HWY_API HWY_SVE_V(BASE, BITS) \ + NAME(HWY_SVE_V(BASE, BITS) hi, HWY_SVE_V(BASE, BITS) lo) { \ + return sv##OP##_##CHAR##BITS(lo, hi, kIndex); \ + } +HWY_SVE_FOREACH(HWY_SVE_EXT, Ext, ext) +#undef HWY_SVE_EXT + +} // namespace detail + +// ------------------------------ ConcatUpperLower +template +HWY_API V ConcatUpperLower(const D d, const V hi, const V lo) { + return IfThenElse(detail::MaskLowerHalf(d), lo, hi); +} + +// ------------------------------ ConcatLowerLower +template +HWY_API V ConcatLowerLower(const D d, const V hi, const V lo) { + if (detail::IsFull(d)) { +#if defined(__ARM_FEATURE_SVE_MATMUL_FP64) && HWY_TARGET == HWY_SVE_256 + return detail::ConcatEvenBlocks(hi, lo); +#endif +#if HWY_TARGET == HWY_SVE2_128 + const Repartition du64; + const auto lo64 = BitCast(du64, lo); + return BitCast(d, InterleaveLower(du64, lo64, BitCast(du64, hi))); +#endif + } + return detail::Splice(hi, lo, detail::MaskLowerHalf(d)); +} + +// ------------------------------ ConcatLowerUpper +template +HWY_API V ConcatLowerUpper(const D d, const V hi, const V lo) { +#if HWY_TARGET == HWY_SVE_256 || HWY_TARGET == HWY_SVE2_128 // constexpr Lanes + if (detail::IsFull(d)) { + return detail::Ext(hi, lo); + } +#endif + return detail::Splice(hi, lo, detail::MaskUpperHalf(d)); +} + +// ------------------------------ ConcatUpperUpper +template +HWY_API V ConcatUpperUpper(const D d, const V hi, const V lo) { + if (detail::IsFull(d)) { +#if defined(__ARM_FEATURE_SVE_MATMUL_FP64) && HWY_TARGET == HWY_SVE_256 + return detail::ConcatOddBlocks(hi, lo); +#endif +#if HWY_TARGET == HWY_SVE2_128 + const Repartition du64; + const auto lo64 = BitCast(du64, lo); + return BitCast(d, InterleaveUpper(du64, lo64, BitCast(du64, hi))); +#endif + } + const svbool_t mask_upper = detail::MaskUpperHalf(d); + const V lo_upper = detail::Splice(lo, lo, mask_upper); + return IfThenElse(mask_upper, hi, lo_upper); +} + +// ------------------------------ Combine +template +HWY_API VFromD Combine(const D d, const V2 hi, const V2 lo) { + return ConcatLowerLower(d, hi, lo); +} + +// ------------------------------ ZeroExtendVector +template +HWY_API V ZeroExtendVector(const D d, const V lo) { + return Combine(d, Zero(Half()), lo); +} + +// ------------------------------ Lower/UpperHalf + +template +HWY_API V LowerHalf(D2 /* tag */, const V v) { + return v; +} + +template +HWY_API V LowerHalf(const V v) { + return v; +} + +template +HWY_API V UpperHalf(const DH dh, const V v) { + const Twice d; + // Cast so that we support bfloat16_t. + const RebindToUnsigned du; + const VFromD vu = BitCast(du, v); +#if HWY_TARGET == HWY_SVE_256 || HWY_TARGET == HWY_SVE2_128 // constexpr Lanes + return BitCast(d, detail::Ext(vu, vu)); +#else + const MFromD mask = detail::MaskUpperHalf(du); + return BitCast(d, detail::Splice(vu, vu, mask)); +#endif +} + +// ================================================== REDUCE + +// These return T, whereas the Highway op returns a broadcasted vector. +namespace detail { +#define HWY_SVE_REDUCE_ADD(BASE, CHAR, BITS, HALF, NAME, OP) \ + HWY_API HWY_SVE_T(BASE, BITS) NAME(svbool_t pg, HWY_SVE_V(BASE, BITS) v) { \ + /* The intrinsic returns [u]int64_t; truncate to T so we can broadcast. 
*/ \ + using T = HWY_SVE_T(BASE, BITS); \ + using TU = MakeUnsigned; \ + constexpr uint64_t kMask = LimitsMax(); \ + return static_cast(static_cast( \ + static_cast(sv##OP##_##CHAR##BITS(pg, v)) & kMask)); \ + } + +#define HWY_SVE_REDUCE(BASE, CHAR, BITS, HALF, NAME, OP) \ + HWY_API HWY_SVE_T(BASE, BITS) NAME(svbool_t pg, HWY_SVE_V(BASE, BITS) v) { \ + return sv##OP##_##CHAR##BITS(pg, v); \ + } + +HWY_SVE_FOREACH_UI(HWY_SVE_REDUCE_ADD, SumOfLanesM, addv) +HWY_SVE_FOREACH_F(HWY_SVE_REDUCE, SumOfLanesM, addv) + +HWY_SVE_FOREACH_UI(HWY_SVE_REDUCE, MinOfLanesM, minv) +HWY_SVE_FOREACH_UI(HWY_SVE_REDUCE, MaxOfLanesM, maxv) +// NaN if all are +HWY_SVE_FOREACH_F(HWY_SVE_REDUCE, MinOfLanesM, minnmv) +HWY_SVE_FOREACH_F(HWY_SVE_REDUCE, MaxOfLanesM, maxnmv) + +#undef HWY_SVE_REDUCE +#undef HWY_SVE_REDUCE_ADD +} // namespace detail + +template +V SumOfLanes(D d, V v) { + return Set(d, detail::SumOfLanesM(detail::MakeMask(d), v)); +} + +template +V MinOfLanes(D d, V v) { + return Set(d, detail::MinOfLanesM(detail::MakeMask(d), v)); +} + +template +V MaxOfLanes(D d, V v) { + return Set(d, detail::MaxOfLanesM(detail::MakeMask(d), v)); +} + + +// ================================================== SWIZZLE + +// ------------------------------ GetLane + +namespace detail { +#define HWY_SVE_GET_LANE(BASE, CHAR, BITS, HALF, NAME, OP) \ + HWY_INLINE HWY_SVE_T(BASE, BITS) \ + NAME(HWY_SVE_V(BASE, BITS) v, svbool_t mask) { \ + return sv##OP##_##CHAR##BITS(mask, v); \ + } + +HWY_SVE_FOREACH(HWY_SVE_GET_LANE, GetLaneM, lasta) +#undef HWY_SVE_GET_LANE +} // namespace detail + +template +HWY_API TFromV GetLane(V v) { + return detail::GetLaneM(v, detail::PFalse()); +} + +// ------------------------------ ExtractLane +template +HWY_API TFromV ExtractLane(V v, size_t i) { + return detail::GetLaneM(v, FirstN(DFromV(), i)); +} + +// ------------------------------ InsertLane (IfThenElse) +template +HWY_API V InsertLane(const V v, size_t i, TFromV t) { + const DFromV d; + const auto is_i = detail::EqN(Iota(d, 0), static_cast>(i)); + return IfThenElse(RebindMask(d, is_i), Set(d, t), v); +} + +// ------------------------------ DupEven + +namespace detail { +HWY_SVE_FOREACH(HWY_SVE_RETV_ARGVV, InterleaveEven, trn1) +} // namespace detail + +template +HWY_API V DupEven(const V v) { + return detail::InterleaveEven(v, v); +} + +// ------------------------------ DupOdd + +namespace detail { +HWY_SVE_FOREACH(HWY_SVE_RETV_ARGVV, InterleaveOdd, trn2) +} // namespace detail + +template +HWY_API V DupOdd(const V v) { + return detail::InterleaveOdd(v, v); +} + +// ------------------------------ OddEven + +#if HWY_TARGET == HWY_SVE2_128 || HWY_TARGET == HWY_SVE2 + +#define HWY_SVE_ODD_EVEN(BASE, CHAR, BITS, HALF, NAME, OP) \ + HWY_API HWY_SVE_V(BASE, BITS) \ + NAME(HWY_SVE_V(BASE, BITS) odd, HWY_SVE_V(BASE, BITS) even) { \ + return sv##OP##_##CHAR##BITS(even, odd, /*xor=*/0); \ + } + +HWY_SVE_FOREACH_UI(HWY_SVE_ODD_EVEN, OddEven, eortb_n) +#undef HWY_SVE_ODD_EVEN + +template +HWY_API V OddEven(const V odd, const V even) { + const DFromV d; + const RebindToUnsigned du; + return BitCast(d, OddEven(BitCast(du, odd), BitCast(du, even))); +} + +#else + +template +HWY_API V OddEven(const V odd, const V even) { + const auto odd_in_even = detail::Ext<1>(odd, odd); + return detail::InterleaveEven(even, odd_in_even); +} + +#endif // HWY_TARGET + +// ------------------------------ OddEvenBlocks +template +HWY_API V OddEvenBlocks(const V odd, const V even) { + const DFromV d; +#if HWY_TARGET == HWY_SVE_256 + return ConcatUpperLower(d, odd, even); 
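+  // (A 256-bit vector is exactly two 128-bit blocks: the lower half comes
+  // from `even`, the upper half from `odd`.)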
+#elif HWY_TARGET == HWY_SVE2_128 + (void)odd; + (void)d; + return even; +#else + const RebindToUnsigned du; + using TU = TFromD; + constexpr size_t kShift = CeilLog2(16 / sizeof(TU)); + const auto idx_block = ShiftRight(Iota(du, 0)); + const auto lsb = detail::AndN(idx_block, static_cast(1)); + const svbool_t is_even = detail::EqN(lsb, static_cast(0)); + return IfThenElse(is_even, even, odd); +#endif +} + +// ------------------------------ TableLookupLanes + +template +HWY_API VFromD> IndicesFromVec(D d, VI vec) { + using TI = TFromV; + static_assert(sizeof(TFromD) == sizeof(TI), "Index/lane size mismatch"); + const RebindToUnsigned du; + const auto indices = BitCast(du, vec); +#if HWY_IS_DEBUG_BUILD + HWY_DASSERT(AllTrue(du, detail::LtN(indices, static_cast(Lanes(d))))); +#else + (void)d; +#endif + return indices; +} + +template +HWY_API VFromD> SetTableIndices(D d, const TI* idx) { + static_assert(sizeof(TFromD) == sizeof(TI), "Index size must match lane"); + return IndicesFromVec(d, LoadU(Rebind(), idx)); +} + +// <32bit are not part of Highway API, but used in Broadcast. +#define HWY_SVE_TABLE(BASE, CHAR, BITS, HALF, NAME, OP) \ + HWY_API HWY_SVE_V(BASE, BITS) \ + NAME(HWY_SVE_V(BASE, BITS) v, HWY_SVE_V(uint, BITS) idx) { \ + return sv##OP##_##CHAR##BITS(v, idx); \ + } + +HWY_SVE_FOREACH(HWY_SVE_TABLE, TableLookupLanes, tbl) +#undef HWY_SVE_TABLE + +// ------------------------------ SwapAdjacentBlocks (TableLookupLanes) + +namespace detail { + +template +constexpr size_t LanesPerBlock(Simd /* tag */) { + // We might have a capped vector smaller than a block, so honor that. + return HWY_MIN(16 / sizeof(T), detail::ScaleByPower(N, kPow2)); +} + +} // namespace detail + +template +HWY_API V SwapAdjacentBlocks(const V v) { + const DFromV d; +#if HWY_TARGET == HWY_SVE_256 + return ConcatLowerUpper(d, v, v); +#elif HWY_TARGET == HWY_SVE2_128 + (void)d; + return v; +#else + const RebindToUnsigned du; + constexpr auto kLanesPerBlock = + static_cast>(detail::LanesPerBlock(d)); + const VFromD idx = detail::XorN(Iota(du, 0), kLanesPerBlock); + return TableLookupLanes(v, idx); +#endif +} + +// ------------------------------ Reverse + +namespace detail { + +#define HWY_SVE_REVERSE(BASE, CHAR, BITS, HALF, NAME, OP) \ + HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_V(BASE, BITS) v) { \ + return sv##OP##_##CHAR##BITS(v); \ + } + +HWY_SVE_FOREACH(HWY_SVE_REVERSE, ReverseFull, rev) +#undef HWY_SVE_REVERSE + +} // namespace detail + +template +HWY_API V Reverse(D d, V v) { + using T = TFromD; + const auto reversed = detail::ReverseFull(v); + if (HWY_SVE_IS_POW2 && detail::IsFull(d)) return reversed; + // Shift right to remove extra (non-pow2 and remainder) lanes. + // TODO(janwas): on SVE2, use WHILEGE. + // Avoids FirstN truncating to the return vector size. Must also avoid Not + // because that is limited to SV_POW2. 
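+  // The mask below selects the top Lanes(d) lanes of the full hardware
+  // vector, which now hold the reversed payload; Splice slides them down to
+  // lane 0.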
+ const ScalableTag dfull; + const svbool_t all_true = detail::AllPTrue(dfull); + const size_t all_lanes = detail::AllHardwareLanes(hwy::SizeTag()); + const svbool_t mask = + svnot_b_z(all_true, FirstN(dfull, all_lanes - Lanes(d))); + return detail::Splice(reversed, reversed, mask); +} + +// ------------------------------ Reverse2 + +template +HWY_API VFromD Reverse2(D d, const VFromD v) { + const RebindToUnsigned du; + const RepartitionToWide dw; + return BitCast(d, svrevh_u32_x(detail::PTrue(d), BitCast(dw, v))); +} + +template +HWY_API VFromD Reverse2(D d, const VFromD v) { + const RebindToUnsigned du; + const RepartitionToWide dw; + return BitCast(d, svrevw_u64_x(detail::PTrue(d), BitCast(dw, v))); +} + +template +HWY_API VFromD Reverse2(D d, const VFromD v) { // 3210 +#if HWY_TARGET == HWY_SVE2_128 + if (detail::IsFull(d)) { + return detail::Ext<1>(v, v); + } +#endif + (void)d; + const auto odd_in_even = detail::Ext<1>(v, v); // x321 + return detail::InterleaveEven(odd_in_even, v); // 2301 +} +// ------------------------------ Reverse4 (TableLookupLanes) +template +HWY_API VFromD Reverse4(D d, const VFromD v) { + if (HWY_TARGET == HWY_SVE_256 && sizeof(TFromD) == 8 && + detail::IsFull(d)) { + return detail::ReverseFull(v); + } + // TODO(janwas): is this approach faster than Shuffle0123? + const RebindToUnsigned du; + const auto idx = detail::XorN(Iota(du, 0), 3); + return TableLookupLanes(v, idx); +} + +// ------------------------------ Reverse8 (TableLookupLanes) +template +HWY_API VFromD Reverse8(D d, const VFromD v) { + const RebindToUnsigned du; + const auto idx = detail::XorN(Iota(du, 0), 7); + return TableLookupLanes(v, idx); +} + +// ------------------------------ Compress (PromoteTo) + +template +struct CompressIsPartition { +#if HWY_TARGET == HWY_SVE_256 || HWY_TARGET == HWY_SVE2_128 + // Optimization for 64-bit lanes (could also be applied to 32-bit, but that + // requires a larger table). + enum { value = (sizeof(T) == 8) }; +#else + enum { value = 0 }; +#endif // HWY_TARGET == HWY_SVE_256 +}; + +#define HWY_SVE_COMPRESS(BASE, CHAR, BITS, HALF, NAME, OP) \ + HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_V(BASE, BITS) v, svbool_t mask) { \ + return sv##OP##_##CHAR##BITS(mask, v); \ + } + +#if HWY_TARGET == HWY_SVE_256 || HWY_TARGET == HWY_SVE2_128 +HWY_SVE_FOREACH_UI32(HWY_SVE_COMPRESS, Compress, compact) +HWY_SVE_FOREACH_F32(HWY_SVE_COMPRESS, Compress, compact) +#else +HWY_SVE_FOREACH_UIF3264(HWY_SVE_COMPRESS, Compress, compact) +#endif +#undef HWY_SVE_COMPRESS + +#if HWY_TARGET == HWY_SVE_256 || HWY_IDE +template +HWY_API V Compress(V v, svbool_t mask) { + const DFromV d; + const RebindToUnsigned du64; + + // Convert mask into bitfield via horizontal sum (faster than ORV) of masked + // bits 1, 2, 4, 8. Pre-multiply by N so we can use it as an offset for + // SetTableIndices. + const svuint64_t bits = Shl(Set(du64, 1), Iota(du64, 2)); + const size_t offset = detail::SumOfLanesM(mask, bits); + + // See CompressIsPartition. + alignas(16) static constexpr uint64_t table[4 * 16] = { + // PrintCompress64x4Tables + 0, 1, 2, 3, 0, 1, 2, 3, 1, 0, 2, 3, 0, 1, 2, 3, 2, 0, 1, 3, 0, 2, + 1, 3, 1, 2, 0, 3, 0, 1, 2, 3, 3, 0, 1, 2, 0, 3, 1, 2, 1, 3, 0, 2, + 0, 1, 3, 2, 2, 3, 0, 1, 0, 2, 3, 1, 1, 2, 3, 0, 0, 1, 2, 3}; + return TableLookupLanes(v, SetTableIndices(d, table + offset)); +} +#endif // HWY_TARGET == HWY_SVE_256 +#if HWY_TARGET == HWY_SVE2_128 || HWY_IDE +template +HWY_API V Compress(V v, svbool_t mask) { + // If mask == 10: swap via splice. 
A mask of 00 or 11 leaves v unchanged, 10 + // swaps upper/lower (the lower half is set to the upper half, and the + // remaining upper half is filled from the lower half of the second v), and + // 01 is invalid because it would ConcatLowerLower. zip1 and AndNot keep 10 + // unchanged and map everything else to 00. + const svbool_t maskLL = svzip1_b64(mask, mask); // broadcast lower lane + return detail::Splice(v, v, AndNot(maskLL, mask)); +} +#endif // HWY_TARGET == HWY_SVE_256 + +template +HWY_API V Compress(V v, svbool_t mask16) { + static_assert(!IsSame(), "Must use overload"); + const DFromV d16; + + // Promote vector and mask to 32-bit + const RepartitionToWide dw; + const auto v32L = PromoteTo(dw, v); + const auto v32H = detail::PromoteUpperTo(dw, v); + const svbool_t mask32L = svunpklo_b(mask16); + const svbool_t mask32H = svunpkhi_b(mask16); + + const auto compressedL = Compress(v32L, mask32L); + const auto compressedH = Compress(v32H, mask32H); + + // Demote to 16-bit (already in range) - separately so we can splice + const V evenL = BitCast(d16, compressedL); + const V evenH = BitCast(d16, compressedH); + const V v16L = detail::ConcatEvenFull(evenL, evenL); // lower half + const V v16H = detail::ConcatEvenFull(evenH, evenH); + + // We need to combine two vectors of non-constexpr length, so the only option + // is Splice, which requires us to synthesize a mask. NOTE: this function uses + // full vectors (SV_ALL instead of SV_POW2), hence we need unmasked svcnt. + const size_t countL = detail::CountTrueFull(dw, mask32L); + const auto compressed_maskL = FirstN(d16, countL); + return detail::Splice(v16H, v16L, compressed_maskL); +} + +// Must treat float16_t as integers so we can ConcatEven. +HWY_API svfloat16_t Compress(svfloat16_t v, svbool_t mask16) { + const DFromV df; + const RebindToSigned di; + return BitCast(df, Compress(BitCast(di, v), mask16)); +} + +// ------------------------------ CompressNot + +template +HWY_API V CompressNot(V v, const svbool_t mask) { + return Compress(v, Not(mask)); +} + +template +HWY_API V CompressNot(V v, svbool_t mask) { +#if HWY_TARGET == HWY_SVE2_128 || HWY_IDE + // If mask == 01: swap via splice. A mask of 00 or 11 leaves v unchanged, 10 + // swaps upper/lower (the lower half is set to the upper half, and the + // remaining upper half is filled from the lower half of the second v), and + // 01 is invalid because it would ConcatLowerLower. zip1 and AndNot map + // 01 to 10, and everything else to 00. + const svbool_t maskLL = svzip1_b64(mask, mask); // broadcast lower lane + return detail::Splice(v, v, AndNot(mask, maskLL)); +#endif +#if HWY_TARGET == HWY_SVE_256 || HWY_IDE + const DFromV d; + const RebindToUnsigned du64; + + // Convert mask into bitfield via horizontal sum (faster than ORV) of masked + // bits 1, 2, 4, 8. Pre-multiply by N so we can use it as an offset for + // SetTableIndices. + const svuint64_t bits = Shl(Set(du64, 1), Iota(du64, 2)); + const size_t offset = detail::SumOfLanesM(mask, bits); + + // See CompressIsPartition. 
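+  // offset = 4 * mask_bits; each group of four u64 in the table below is the
+  // permutation for one of the 16 possible masks.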
+ alignas(16) static constexpr uint64_t table[4 * 16] = { + // PrintCompressNot64x4Tables + 0, 1, 2, 3, 1, 2, 3, 0, 0, 2, 3, 1, 2, 3, 0, 1, 0, 1, 3, 2, 1, 3, + 0, 2, 0, 3, 1, 2, 3, 0, 1, 2, 0, 1, 2, 3, 1, 2, 0, 3, 0, 2, 1, 3, + 2, 0, 1, 3, 0, 1, 2, 3, 1, 0, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3}; + return TableLookupLanes(v, SetTableIndices(d, table + offset)); +#endif // HWY_TARGET == HWY_SVE_256 + + return Compress(v, Not(mask)); +} + +// ------------------------------ CompressBlocksNot +HWY_API svuint64_t CompressBlocksNot(svuint64_t v, svbool_t mask) { +#if HWY_TARGET == HWY_SVE2_128 + (void)mask; + return v; +#endif +#if HWY_TARGET == HWY_SVE_256 || HWY_IDE + uint64_t bits = 0; // predicate reg is 32-bit + CopyBytes<4>(&mask, &bits); // not same size - 64-bit more efficient + // Concatenate LSB for upper and lower blocks, pre-scale by 4 for table idx. + const size_t offset = ((bits & 1) ? 4u : 0u) + ((bits & 0x10000) ? 8u : 0u); + // See CompressIsPartition. Manually generated; flip halves if mask = [0, 1]. + alignas(16) static constexpr uint64_t table[4 * 4] = {0, 1, 2, 3, 2, 3, 0, 1, + 0, 1, 2, 3, 0, 1, 2, 3}; + const ScalableTag d; + return TableLookupLanes(v, SetTableIndices(d, table + offset)); +#endif + + return CompressNot(v, mask); +} + +// ------------------------------ CompressStore +template +HWY_API size_t CompressStore(const V v, const svbool_t mask, const D d, + TFromD* HWY_RESTRICT unaligned) { + StoreU(Compress(v, mask), d, unaligned); + return CountTrue(d, mask); +} + +// ------------------------------ CompressBlendedStore +template +HWY_API size_t CompressBlendedStore(const V v, const svbool_t mask, const D d, + TFromD* HWY_RESTRICT unaligned) { + const size_t count = CountTrue(d, mask); + const svbool_t store_mask = FirstN(d, count); + BlendedStore(Compress(v, mask), store_mask, d, unaligned); + return count; +} + +// ================================================== BLOCKWISE + +// ------------------------------ CombineShiftRightBytes + +// Prevent accidentally using these for 128-bit vectors - should not be +// necessary. +#if HWY_TARGET != HWY_SVE2_128 +namespace detail { + +// For x86-compatible behaviour mandated by Highway API: TableLookupBytes +// offsets are implicitly relative to the start of their 128-bit block. 
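+// OffsetsOf128BitBlocks returns each lane's block-start index (iota with the
+// low bits cleared); adding it to block-local indices yields the vector-wide
+// lane indices required by TableLookupLanes.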
+template +HWY_INLINE V OffsetsOf128BitBlocks(const D d, const V iota0) { + using T = MakeUnsigned>; + return detail::AndNotN(static_cast(LanesPerBlock(d) - 1), iota0); +} + +template +svbool_t FirstNPerBlock(D d) { + const RebindToUnsigned du; + constexpr size_t kLanesPerBlock = detail::LanesPerBlock(du); + const svuint8_t idx_mod = + svdupq_n_u8(0 % kLanesPerBlock, 1 % kLanesPerBlock, 2 % kLanesPerBlock, + 3 % kLanesPerBlock, 4 % kLanesPerBlock, 5 % kLanesPerBlock, + 6 % kLanesPerBlock, 7 % kLanesPerBlock, 8 % kLanesPerBlock, + 9 % kLanesPerBlock, 10 % kLanesPerBlock, 11 % kLanesPerBlock, + 12 % kLanesPerBlock, 13 % kLanesPerBlock, 14 % kLanesPerBlock, + 15 % kLanesPerBlock); + return detail::LtN(BitCast(du, idx_mod), kLanes); +} +template +svbool_t FirstNPerBlock(D d) { + const RebindToUnsigned du; + constexpr size_t kLanesPerBlock = detail::LanesPerBlock(du); + const svuint16_t idx_mod = + svdupq_n_u16(0 % kLanesPerBlock, 1 % kLanesPerBlock, 2 % kLanesPerBlock, + 3 % kLanesPerBlock, 4 % kLanesPerBlock, 5 % kLanesPerBlock, + 6 % kLanesPerBlock, 7 % kLanesPerBlock); + return detail::LtN(BitCast(du, idx_mod), kLanes); +} +template +svbool_t FirstNPerBlock(D d) { + const RebindToUnsigned du; + constexpr size_t kLanesPerBlock = detail::LanesPerBlock(du); + const svuint32_t idx_mod = + svdupq_n_u32(0 % kLanesPerBlock, 1 % kLanesPerBlock, 2 % kLanesPerBlock, + 3 % kLanesPerBlock); + return detail::LtN(BitCast(du, idx_mod), kLanes); +} +template +svbool_t FirstNPerBlock(D d) { + const RebindToUnsigned du; + constexpr size_t kLanesPerBlock = detail::LanesPerBlock(du); + const svuint64_t idx_mod = + svdupq_n_u64(0 % kLanesPerBlock, 1 % kLanesPerBlock); + return detail::LtN(BitCast(du, idx_mod), kLanes); +} + +} // namespace detail +#endif // HWY_TARGET != HWY_SVE2_128 + +template > +HWY_API V CombineShiftRightBytes(const D d, const V hi, const V lo) { + const Repartition d8; + const auto hi8 = BitCast(d8, hi); + const auto lo8 = BitCast(d8, lo); +#if HWY_TARGET == HWY_SVE2_128 + return BitCast(d, detail::Ext(hi8, lo8)); +#else + const auto hi_up = detail::Splice(hi8, hi8, FirstN(d8, 16 - kBytes)); + const auto lo_down = detail::Ext(lo8, lo8); + const svbool_t is_lo = detail::FirstNPerBlock<16 - kBytes>(d8); + return BitCast(d, IfThenElse(is_lo, lo_down, hi_up)); +#endif +} + +// ------------------------------ Shuffle2301 +template +HWY_API V Shuffle2301(const V v) { + const DFromV d; + static_assert(sizeof(TFromD) == 4, "Defined for 32-bit types"); + return Reverse2(d, v); +} + +// ------------------------------ Shuffle2103 +template +HWY_API V Shuffle2103(const V v) { + const DFromV d; + const Repartition d8; + static_assert(sizeof(TFromD) == 4, "Defined for 32-bit types"); + const svuint8_t v8 = BitCast(d8, v); + return BitCast(d, CombineShiftRightBytes<12>(d8, v8, v8)); +} + +// ------------------------------ Shuffle0321 +template +HWY_API V Shuffle0321(const V v) { + const DFromV d; + const Repartition d8; + static_assert(sizeof(TFromD) == 4, "Defined for 32-bit types"); + const svuint8_t v8 = BitCast(d8, v); + return BitCast(d, CombineShiftRightBytes<4>(d8, v8, v8)); +} + +// ------------------------------ Shuffle1032 +template +HWY_API V Shuffle1032(const V v) { + const DFromV d; + const Repartition d8; + static_assert(sizeof(TFromD) == 4, "Defined for 32-bit types"); + const svuint8_t v8 = BitCast(d8, v); + return BitCast(d, CombineShiftRightBytes<8>(d8, v8, v8)); +} + +// ------------------------------ Shuffle01 +template +HWY_API V Shuffle01(const V v) { + const DFromV d; + const 
Repartition d8; + static_assert(sizeof(TFromD) == 8, "Defined for 64-bit types"); + const svuint8_t v8 = BitCast(d8, v); + return BitCast(d, CombineShiftRightBytes<8>(d8, v8, v8)); +} + +// ------------------------------ Shuffle0123 +template +HWY_API V Shuffle0123(const V v) { + return Shuffle2301(Shuffle1032(v)); +} + +// ------------------------------ ReverseBlocks (Reverse, Shuffle01) +template > +HWY_API V ReverseBlocks(D d, V v) { +#if HWY_TARGET == HWY_SVE_256 + if (detail::IsFull(d)) { + return SwapAdjacentBlocks(v); + } else if (detail::IsFull(Twice())) { + return v; + } +#elif HWY_TARGET == HWY_SVE2_128 + (void)d; + return v; +#endif + const Repartition du64; + return BitCast(d, Shuffle01(Reverse(du64, BitCast(du64, v)))); +} + +// ------------------------------ TableLookupBytes + +template +HWY_API VI TableLookupBytes(const V v, const VI idx) { + const DFromV d; + const Repartition du8; +#if HWY_TARGET == HWY_SVE2_128 + return BitCast(d, TableLookupLanes(BitCast(du8, v), BitCast(du8, idx))); +#else + const auto offsets128 = detail::OffsetsOf128BitBlocks(du8, Iota(du8, 0)); + const auto idx8 = Add(BitCast(du8, idx), offsets128); + return BitCast(d, TableLookupLanes(BitCast(du8, v), idx8)); +#endif +} + +template +HWY_API VI TableLookupBytesOr0(const V v, const VI idx) { + const DFromV d; + // Mask size must match vector type, so cast everything to this type. + const Repartition di8; + + auto idx8 = BitCast(di8, idx); + const auto msb = detail::LtN(idx8, 0); + + const auto lookup = TableLookupBytes(BitCast(di8, v), idx8); + return BitCast(d, IfThenZeroElse(msb, lookup)); +} + +// ------------------------------ Broadcast + +#if HWY_TARGET == HWY_SVE2_128 +namespace detail { +#define HWY_SVE_BROADCAST(BASE, CHAR, BITS, HALF, NAME, OP) \ + template \ + HWY_INLINE HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_V(BASE, BITS) v) { \ + return sv##OP##_##CHAR##BITS(v, kLane); \ + } + +HWY_SVE_FOREACH(HWY_SVE_BROADCAST, BroadcastLane, dup_lane) +#undef HWY_SVE_BROADCAST +} // namespace detail +#endif + +template +HWY_API V Broadcast(const V v) { + const DFromV d; + const RebindToUnsigned du; + constexpr size_t kLanesPerBlock = detail::LanesPerBlock(du); + static_assert(0 <= kLane && kLane < kLanesPerBlock, "Invalid lane"); +#if HWY_TARGET == HWY_SVE2_128 + return detail::BroadcastLane(v); +#else + auto idx = detail::OffsetsOf128BitBlocks(du, Iota(du, 0)); + if (kLane != 0) { + idx = detail::AddN(idx, kLane); + } + return TableLookupLanes(v, idx); +#endif +} + +// ------------------------------ ShiftLeftLanes + +template > +HWY_API V ShiftLeftLanes(D d, const V v) { + const auto zero = Zero(d); + const auto shifted = detail::Splice(v, zero, FirstN(d, kLanes)); +#if HWY_TARGET == HWY_SVE2_128 + return shifted; +#else + // Match x86 semantics by zeroing lower lanes in 128-bit blocks + return IfThenElse(detail::FirstNPerBlock(d), zero, shifted); +#endif +} + +template +HWY_API V ShiftLeftLanes(const V v) { + return ShiftLeftLanes(DFromV(), v); +} + +// ------------------------------ ShiftRightLanes +template > +HWY_API V ShiftRightLanes(D d, V v) { + // For capped/fractional vectors, clear upper lanes so we shift in zeros. 
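+  // (Ext is unpredicated and operates on the whole hardware register, so
+  // without this masking, lanes past Lanes(d) would be shifted in rather
+  // than zeros.)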
+ if (!detail::IsFull(d)) { + v = IfThenElseZero(detail::MakeMask(d), v); + } + +#if HWY_TARGET == HWY_SVE2_128 + return detail::Ext(Zero(d), v); +#else + const auto shifted = detail::Ext(v, v); + // Match x86 semantics by zeroing upper lanes in 128-bit blocks + constexpr size_t kLanesPerBlock = detail::LanesPerBlock(d); + const svbool_t mask = detail::FirstNPerBlock(d); + return IfThenElseZero(mask, shifted); +#endif +} + +// ------------------------------ ShiftLeftBytes + +template > +HWY_API V ShiftLeftBytes(const D d, const V v) { + const Repartition d8; + return BitCast(d, ShiftLeftLanes(BitCast(d8, v))); +} + +template +HWY_API V ShiftLeftBytes(const V v) { + return ShiftLeftBytes(DFromV(), v); +} + +// ------------------------------ ShiftRightBytes +template > +HWY_API V ShiftRightBytes(const D d, const V v) { + const Repartition d8; + return BitCast(d, ShiftRightLanes(d8, BitCast(d8, v))); +} + +// ------------------------------ ZipLower + +template >> +HWY_API VFromD ZipLower(DW dw, V a, V b) { + const RepartitionToNarrow dn; + static_assert(IsSame, TFromV>(), "D/V mismatch"); + return BitCast(dw, InterleaveLower(dn, a, b)); +} +template , class DW = RepartitionToWide> +HWY_API VFromD ZipLower(const V a, const V b) { + return BitCast(DW(), InterleaveLower(D(), a, b)); +} + +// ------------------------------ ZipUpper +template >> +HWY_API VFromD ZipUpper(DW dw, V a, V b) { + const RepartitionToNarrow dn; + static_assert(IsSame, TFromV>(), "D/V mismatch"); + return BitCast(dw, InterleaveUpper(dn, a, b)); +} + +// ================================================== Ops with dependencies + +// ------------------------------ PromoteTo bfloat16 (ZipLower) +template +HWY_API svfloat32_t PromoteTo(Simd df32, + const svuint16_t v) { + return BitCast(df32, detail::ZipLowerSame(svdup_n_u16(0), v)); +} + +// ------------------------------ ReorderDemote2To (OddEven) + +template +HWY_API svuint16_t ReorderDemote2To(Simd dbf16, + svfloat32_t a, svfloat32_t b) { + const RebindToUnsigned du16; + const Repartition du32; + const svuint32_t b_in_even = ShiftRight<16>(BitCast(du32, b)); + return BitCast(dbf16, OddEven(BitCast(du16, a), BitCast(du16, b_in_even))); +} + +template +HWY_API svint16_t ReorderDemote2To(Simd d16, svint32_t a, + svint32_t b) { +#if HWY_TARGET == HWY_SVE2 || HWY_TARGET == HWY_SVE2_128 + (void)d16; + const svint16_t a_in_even = svqxtnb_s32(a); + return svqxtnt_s32(a_in_even, b); +#else + const Half dh; + const svint16_t a16 = BitCast(dh, detail::SaturateI(a)); + const svint16_t b16 = BitCast(dh, detail::SaturateI(b)); + return detail::InterleaveEven(a16, b16); +#endif +} + +// ------------------------------ ZeroIfNegative (Lt, IfThenElse) +template +HWY_API V ZeroIfNegative(const V v) { + return IfThenZeroElse(detail::LtN(v, 0), v); +} + +// ------------------------------ BroadcastSignBit (ShiftRight) +template +HWY_API V BroadcastSignBit(const V v) { + return ShiftRight) * 8 - 1>(v); +} + +// ------------------------------ IfNegativeThenElse (BroadcastSignBit) +template +HWY_API V IfNegativeThenElse(V v, V yes, V no) { + static_assert(IsSigned>(), "Only works for signed/float"); + const DFromV d; + const RebindToSigned di; + + const svbool_t m = MaskFromVec(BitCast(d, BroadcastSignBit(BitCast(di, v)))); + return IfThenElse(m, yes, no); +} + +// ------------------------------ AverageRound (ShiftRight) + +#if HWY_TARGET == HWY_SVE2 +HWY_SVE_FOREACH_U08(HWY_SVE_RETV_ARGPVV, AverageRound, rhadd) +HWY_SVE_FOREACH_U16(HWY_SVE_RETV_ARGPVV, AverageRound, rhadd) +#else +template +V 
AverageRound(const V a, const V b) { + return ShiftRight<1>(detail::AddN(Add(a, b), 1)); +} +#endif // HWY_TARGET == HWY_SVE2 + +// ------------------------------ LoadMaskBits (TestBit) + +// `p` points to at least 8 readable bytes, not all of which need be valid. +template +HWY_INLINE svbool_t LoadMaskBits(D d, const uint8_t* HWY_RESTRICT bits) { + const RebindToUnsigned du; + const svuint8_t iota = Iota(du, 0); + + // Load correct number of bytes (bits/8) with 7 zeros after each. + const svuint8_t bytes = BitCast(du, svld1ub_u64(detail::PTrue(d), bits)); + // Replicate bytes 8x such that each byte contains the bit that governs it. + const svuint8_t rep8 = svtbl_u8(bytes, detail::AndNotN(7, iota)); + + const svuint8_t bit = + svdupq_n_u8(1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128); + return TestBit(rep8, bit); +} + +template +HWY_INLINE svbool_t LoadMaskBits(D /* tag */, + const uint8_t* HWY_RESTRICT bits) { + const RebindToUnsigned du; + const Repartition du8; + + // There may be up to 128 bits; avoid reading past the end. + const svuint8_t bytes = svld1(FirstN(du8, (Lanes(du) + 7) / 8), bits); + + // Replicate bytes 16x such that each lane contains the bit that governs it. + const svuint8_t rep16 = svtbl_u8(bytes, ShiftRight<4>(Iota(du8, 0))); + + const svuint16_t bit = svdupq_n_u16(1, 2, 4, 8, 16, 32, 64, 128); + return TestBit(BitCast(du, rep16), bit); +} + +template +HWY_INLINE svbool_t LoadMaskBits(D /* tag */, + const uint8_t* HWY_RESTRICT bits) { + const RebindToUnsigned du; + const Repartition du8; + + // Upper bound = 2048 bits / 32 bit = 64 bits; at least 8 bytes are readable, + // so we can skip computing the actual length (Lanes(du)+7)/8. + const svuint8_t bytes = svld1(FirstN(du8, 8), bits); + + // Replicate bytes 32x such that each lane contains the bit that governs it. + const svuint8_t rep32 = svtbl_u8(bytes, ShiftRight<5>(Iota(du8, 0))); + + // 1, 2, 4, 8, 16, 32, 64, 128, 1, 2 .. + const svuint32_t bit = Shl(Set(du, 1), detail::AndN(Iota(du, 0), 7)); + + return TestBit(BitCast(du, rep32), bit); +} + +template +HWY_INLINE svbool_t LoadMaskBits(D /* tag */, + const uint8_t* HWY_RESTRICT bits) { + const RebindToUnsigned du; + + // Max 2048 bits = 32 lanes = 32 input bits; replicate those into each lane. + // The "at least 8 byte" guarantee in quick_reference ensures this is safe. + uint32_t mask_bits; + CopyBytes<4>(bits, &mask_bits); // copy from bytes + const auto vbits = Set(du, mask_bits); + + // 2 ^ {0,1, .., 31}, will not have more lanes than that. + const svuint64_t bit = Shl(Set(du, 1), Iota(du, 0)); + + return TestBit(vbits, bit); +} + +// ------------------------------ StoreMaskBits + +namespace detail { + +// For each mask lane (governing lane type T), store 1 or 0 in BYTE lanes. +template +HWY_INLINE svuint8_t BoolFromMask(svbool_t m) { + return svdup_n_u8_z(m, 1); +} +template +HWY_INLINE svuint8_t BoolFromMask(svbool_t m) { + const ScalableTag d8; + const svuint8_t b16 = BitCast(d8, svdup_n_u16_z(m, 1)); + return detail::ConcatEvenFull(b16, b16); // lower half +} +template +HWY_INLINE svuint8_t BoolFromMask(svbool_t m) { + return U8FromU32(svdup_n_u32_z(m, 1)); +} +template +HWY_INLINE svuint8_t BoolFromMask(svbool_t m) { + const ScalableTag d32; + const svuint32_t b64 = BitCast(d32, svdup_n_u64_z(m, 1)); + return U8FromU32(detail::ConcatEvenFull(b64, b64)); // lower half +} + +// Compacts groups of 8 u8 into 8 contiguous bits in a 64-bit lane. 
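+// Each input byte is 0 or 1. OR-ing with the u16 view shifted right by 7
+// packs each byte pair into two adjacent bits; repeating with u32 >> 14 and
+// u64 >> 28 gathers all eight bits into the low byte of each u64 lane.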
+HWY_INLINE svuint64_t BitsFromBool(svuint8_t x) { + const ScalableTag d8; + const ScalableTag d16; + const ScalableTag d32; + const ScalableTag d64; + // TODO(janwas): could use SVE2 BDEP, but it's optional. + x = Or(x, BitCast(d8, ShiftRight<7>(BitCast(d16, x)))); + x = Or(x, BitCast(d8, ShiftRight<14>(BitCast(d32, x)))); + x = Or(x, BitCast(d8, ShiftRight<28>(BitCast(d64, x)))); + return BitCast(d64, x); +} + +} // namespace detail + +// `p` points to at least 8 writable bytes. +// TODO(janwas): specialize for HWY_SVE_256 +template +HWY_API size_t StoreMaskBits(D d, svbool_t m, uint8_t* bits) { + svuint64_t bits_in_u64 = + detail::BitsFromBool(detail::BoolFromMask>(m)); + + const size_t num_bits = Lanes(d); + const size_t num_bytes = (num_bits + 8 - 1) / 8; // Round up, see below + + // Truncate each u64 to 8 bits and store to u8. + svst1b_u64(FirstN(ScalableTag(), num_bytes), bits, bits_in_u64); + + // Non-full byte, need to clear the undefined upper bits. Can happen for + // capped/fractional vectors or large T and small hardware vectors. + if (num_bits < 8) { + const int mask = static_cast((1ull << num_bits) - 1); + bits[0] = static_cast(bits[0] & mask); + } + // Else: we wrote full bytes because num_bits is a power of two >= 8. + + return num_bytes; +} + +// ------------------------------ CompressBits (LoadMaskBits) +template +HWY_INLINE V CompressBits(V v, const uint8_t* HWY_RESTRICT bits) { + return Compress(v, LoadMaskBits(DFromV(), bits)); +} + +// ------------------------------ CompressBitsStore (LoadMaskBits) +template +HWY_API size_t CompressBitsStore(VFromD v, const uint8_t* HWY_RESTRICT bits, + D d, TFromD* HWY_RESTRICT unaligned) { + return CompressStore(v, LoadMaskBits(d, bits), d, unaligned); +} + +// ------------------------------ MulEven (InterleaveEven) + +#if HWY_TARGET == HWY_SVE2 +namespace detail { +#define HWY_SVE_MUL_EVEN(BASE, CHAR, BITS, HALF, NAME, OP) \ + HWY_API HWY_SVE_V(BASE, BITS) \ + NAME(HWY_SVE_V(BASE, HALF) a, HWY_SVE_V(BASE, HALF) b) { \ + return sv##OP##_##CHAR##BITS(a, b); \ + } + +HWY_SVE_FOREACH_UI64(HWY_SVE_MUL_EVEN, MulEvenNative, mullb) +#undef HWY_SVE_MUL_EVEN +} // namespace detail +#endif + +template >> +HWY_API VFromD MulEven(const V a, const V b) { +#if HWY_TARGET == HWY_SVE2 + return BitCast(DW(), detail::MulEvenNative(a, b)); +#else + const auto lo = Mul(a, b); + const auto hi = MulHigh(a, b); + return BitCast(DW(), detail::InterleaveEven(lo, hi)); +#endif +} + +HWY_API svuint64_t MulEven(const svuint64_t a, const svuint64_t b) { + const auto lo = Mul(a, b); + const auto hi = MulHigh(a, b); + return detail::InterleaveEven(lo, hi); +} + +HWY_API svuint64_t MulOdd(const svuint64_t a, const svuint64_t b) { + const auto lo = Mul(a, b); + const auto hi = MulHigh(a, b); + return detail::InterleaveOdd(lo, hi); +} + +// ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower) + +template +HWY_API svfloat32_t ReorderWidenMulAccumulate(Simd df32, + svuint16_t a, svuint16_t b, + const svfloat32_t sum0, + svfloat32_t& sum1) { + // TODO(janwas): svbfmlalb_f32 if __ARM_FEATURE_SVE_BF16. 
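+  // bfloat16 is the upper half of a binary32, so zipping a zero u16 below
+  // each lane reconstructs the exact f32 value; the rest is then an ordinary
+  // f32 MulAdd on the lower and upper halves.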
+ const Repartition du16; + const RebindToUnsigned du32; + const svuint16_t zero = Zero(du16); + const svuint32_t a0 = ZipLower(du32, zero, BitCast(du16, a)); + const svuint32_t a1 = ZipUpper(du32, zero, BitCast(du16, a)); + const svuint32_t b0 = ZipLower(du32, zero, BitCast(du16, b)); + const svuint32_t b1 = ZipUpper(du32, zero, BitCast(du16, b)); + sum1 = MulAdd(BitCast(df32, a1), BitCast(df32, b1), sum1); + return MulAdd(BitCast(df32, a0), BitCast(df32, b0), sum0); +} + +template +HWY_API svint32_t ReorderWidenMulAccumulate(Simd d32, + svint16_t a, svint16_t b, + const svint32_t sum0, + svint32_t& sum1) { +#if HWY_TARGET == HWY_SVE2 || HWY_TARGET == HWY_SVE2_128 + (void)d32; + sum1 = svmlalt_s32(sum1, a, b); + return svmlalb_s32(sum0, a, b); +#else + const svbool_t pg = detail::PTrue(d32); + const svint32_t a0 = svunpklo_s32(a); + const svint32_t b0 = svunpklo_s32(b); + svint32_t a1, b1; + if (detail::IsFull(d32)) { + a1 = svunpkhi_s32(a); + b1 = svunpkhi_s32(b); + } else { + const Rebind d16h; + a1 = svunpklo_s32(UpperHalf(d16h, a)); + b1 = svunpklo_s32(UpperHalf(d16h, b)); + } + sum1 = svmla_s32_x(pg, sum1, a1, b1); + return svmla_s32_x(pg, sum0, a0, b0); +#endif +} + +// ------------------------------ AESRound / CLMul + +#if defined(__ARM_FEATURE_SVE2_AES) || \ + ((HWY_TARGET == HWY_SVE2 || HWY_TARGET == HWY_SVE2_128) && \ + HWY_HAVE_RUNTIME_DISPATCH) + +// Per-target flag to prevent generic_ops-inl.h from defining AESRound. +#ifdef HWY_NATIVE_AES +#undef HWY_NATIVE_AES +#else +#define HWY_NATIVE_AES +#endif + +HWY_API svuint8_t AESRound(svuint8_t state, svuint8_t round_key) { + // It is not clear whether E and MC fuse like they did on NEON. + const svuint8_t zero = svdup_n_u8(0); + return Xor(svaesmc_u8(svaese_u8(state, zero)), round_key); +} + +HWY_API svuint8_t AESLastRound(svuint8_t state, svuint8_t round_key) { + return Xor(svaese_u8(state, svdup_n_u8(0)), round_key); +} + +HWY_API svuint64_t CLMulLower(const svuint64_t a, const svuint64_t b) { + return svpmullb_pair(a, b); +} + +HWY_API svuint64_t CLMulUpper(const svuint64_t a, const svuint64_t b) { + return svpmullt_pair(a, b); +} + +#endif // __ARM_FEATURE_SVE2_AES + +// ------------------------------ Lt128 + +namespace detail { +#define HWY_SVE_DUP(BASE, CHAR, BITS, HALF, NAME, OP) \ + template \ + HWY_API svbool_t NAME(HWY_SVE_D(BASE, BITS, N, kPow2) /*d*/, svbool_t m) { \ + return sv##OP##_b##BITS(m, m); \ + } + +HWY_SVE_FOREACH_U(HWY_SVE_DUP, DupEvenB, trn1) // actually for bool +HWY_SVE_FOREACH_U(HWY_SVE_DUP, DupOddB, trn2) // actually for bool +#undef HWY_SVE_DUP + +#if HWY_TARGET == HWY_SVE_256 || HWY_IDE +template +HWY_INLINE svuint64_t Lt128Vec(D d, const svuint64_t a, const svuint64_t b) { + static_assert(!IsSigned>() && sizeof(TFromD) == 8, + "D must be u64"); + const svbool_t eqHx = Eq(a, b); // only odd lanes used + // Convert to vector: more pipelines can execute vector TRN* instructions + // than the predicate version. + const svuint64_t ltHL = VecFromMask(d, Lt(a, b)); + // Move into upper lane: ltL if the upper half is equal, otherwise ltH. + // Requires an extra IfThenElse because INSR, EXT, TRN2 are unpredicated. + const svuint64_t ltHx = IfThenElse(eqHx, DupEven(ltHL), ltHL); + // Duplicate upper lane into lower. 
+ return DupOdd(ltHx); +} +#endif +} // namespace detail + +template +HWY_INLINE svbool_t Lt128(D d, const svuint64_t a, const svuint64_t b) { +#if HWY_TARGET == HWY_SVE_256 + return MaskFromVec(detail::Lt128Vec(d, a, b)); +#else + static_assert(!IsSigned>() && sizeof(TFromD) == 8, + "D must be u64"); + const svbool_t eqHx = Eq(a, b); // only odd lanes used + const svbool_t ltHL = Lt(a, b); + // Move into upper lane: ltL if the upper half is equal, otherwise ltH. + const svbool_t ltHx = svsel_b(eqHx, detail::DupEvenB(d, ltHL), ltHL); + // Duplicate upper lane into lower. + return detail::DupOddB(d, ltHx); +#endif // HWY_TARGET != HWY_SVE_256 +} + +// ------------------------------ Lt128Upper + +template +HWY_INLINE svbool_t Lt128Upper(D d, svuint64_t a, svuint64_t b) { + static_assert(!IsSigned>() && sizeof(TFromD) == 8, + "D must be u64"); + const svbool_t ltHL = Lt(a, b); + return detail::DupOddB(d, ltHL); +} + +// ------------------------------ Eq128, Ne128 + +#if HWY_TARGET == HWY_SVE_256 || HWY_IDE +namespace detail { + +template +HWY_INLINE svuint64_t Eq128Vec(D d, const svuint64_t a, const svuint64_t b) { + static_assert(!IsSigned>() && sizeof(TFromD) == 8, + "D must be u64"); + // Convert to vector: more pipelines can execute vector TRN* instructions + // than the predicate version. + const svuint64_t eqHL = VecFromMask(d, Eq(a, b)); + // Duplicate upper and lower. + const svuint64_t eqHH = DupOdd(eqHL); + const svuint64_t eqLL = DupEven(eqHL); + return And(eqLL, eqHH); +} + +template +HWY_INLINE svuint64_t Ne128Vec(D d, const svuint64_t a, const svuint64_t b) { + static_assert(!IsSigned>() && sizeof(TFromD) == 8, + "D must be u64"); + // Convert to vector: more pipelines can execute vector TRN* instructions + // than the predicate version. + const svuint64_t neHL = VecFromMask(d, Ne(a, b)); + // Duplicate upper and lower. 
+ const svuint64_t neHH = DupOdd(neHL); + const svuint64_t neLL = DupEven(neHL); + return Or(neLL, neHH); +} + +} // namespace detail +#endif + +template +HWY_INLINE svbool_t Eq128(D d, const svuint64_t a, const svuint64_t b) { +#if HWY_TARGET == HWY_SVE_256 + return MaskFromVec(detail::Eq128Vec(d, a, b)); +#else + static_assert(!IsSigned>() && sizeof(TFromD) == 8, + "D must be u64"); + const svbool_t eqHL = Eq(a, b); + const svbool_t eqHH = detail::DupOddB(d, eqHL); + const svbool_t eqLL = detail::DupEvenB(d, eqHL); + return And(eqLL, eqHH); +#endif // HWY_TARGET != HWY_SVE_256 +} + +template +HWY_INLINE svbool_t Ne128(D d, const svuint64_t a, const svuint64_t b) { +#if HWY_TARGET == HWY_SVE_256 + return MaskFromVec(detail::Ne128Vec(d, a, b)); +#else + static_assert(!IsSigned>() && sizeof(TFromD) == 8, + "D must be u64"); + const svbool_t neHL = Ne(a, b); + const svbool_t neHH = detail::DupOddB(d, neHL); + const svbool_t neLL = detail::DupEvenB(d, neHL); + return Or(neLL, neHH); +#endif // HWY_TARGET != HWY_SVE_256 +} + +// ------------------------------ Eq128Upper, Ne128Upper + +template +HWY_INLINE svbool_t Eq128Upper(D d, svuint64_t a, svuint64_t b) { + static_assert(!IsSigned>() && sizeof(TFromD) == 8, + "D must be u64"); + const svbool_t eqHL = Eq(a, b); + return detail::DupOddB(d, eqHL); +} + +template +HWY_INLINE svbool_t Ne128Upper(D d, svuint64_t a, svuint64_t b) { + static_assert(!IsSigned>() && sizeof(TFromD) == 8, + "D must be u64"); + const svbool_t neHL = Ne(a, b); + return detail::DupOddB(d, neHL); +} + +// ------------------------------ Min128, Max128 (Lt128) + +template +HWY_INLINE svuint64_t Min128(D d, const svuint64_t a, const svuint64_t b) { +#if HWY_TARGET == HWY_SVE_256 + return IfVecThenElse(detail::Lt128Vec(d, a, b), a, b); +#else + return IfThenElse(Lt128(d, a, b), a, b); +#endif +} + +template +HWY_INLINE svuint64_t Max128(D d, const svuint64_t a, const svuint64_t b) { +#if HWY_TARGET == HWY_SVE_256 + return IfVecThenElse(detail::Lt128Vec(d, b, a), a, b); +#else + return IfThenElse(Lt128(d, b, a), a, b); +#endif +} + +template +HWY_INLINE svuint64_t Min128Upper(D d, const svuint64_t a, const svuint64_t b) { + return IfThenElse(Lt128Upper(d, a, b), a, b); +} + +template +HWY_INLINE svuint64_t Max128Upper(D d, const svuint64_t a, const svuint64_t b) { + return IfThenElse(Lt128Upper(d, b, a), a, b); +} + +// ================================================== END MACROS +namespace detail { // for code folding +#undef HWY_IF_FLOAT_V +#undef HWY_IF_LANE_SIZE_V +#undef HWY_SVE_ALL_PTRUE +#undef HWY_SVE_D +#undef HWY_SVE_FOREACH +#undef HWY_SVE_FOREACH_F +#undef HWY_SVE_FOREACH_F16 +#undef HWY_SVE_FOREACH_F32 +#undef HWY_SVE_FOREACH_F64 +#undef HWY_SVE_FOREACH_I +#undef HWY_SVE_FOREACH_I08 +#undef HWY_SVE_FOREACH_I16 +#undef HWY_SVE_FOREACH_I32 +#undef HWY_SVE_FOREACH_I64 +#undef HWY_SVE_FOREACH_IF +#undef HWY_SVE_FOREACH_U +#undef HWY_SVE_FOREACH_U08 +#undef HWY_SVE_FOREACH_U16 +#undef HWY_SVE_FOREACH_U32 +#undef HWY_SVE_FOREACH_U64 +#undef HWY_SVE_FOREACH_UI +#undef HWY_SVE_FOREACH_UI08 +#undef HWY_SVE_FOREACH_UI16 +#undef HWY_SVE_FOREACH_UI32 +#undef HWY_SVE_FOREACH_UI64 +#undef HWY_SVE_FOREACH_UIF3264 +#undef HWY_SVE_PTRUE +#undef HWY_SVE_RETV_ARGPV +#undef HWY_SVE_RETV_ARGPVN +#undef HWY_SVE_RETV_ARGPVV +#undef HWY_SVE_RETV_ARGV +#undef HWY_SVE_RETV_ARGVN +#undef HWY_SVE_RETV_ARGVV +#undef HWY_SVE_T +#undef HWY_SVE_UNDEFINED +#undef HWY_SVE_V + +} // namespace detail +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // 
namespace hwy +HWY_AFTER_NAMESPACE(); diff --git a/hwy/ops/emu128-inl.h b/hwy/ops/emu128-inl.h new file mode 100644 index 0000000..5063a6d --- /dev/null +++ b/hwy/ops/emu128-inl.h @@ -0,0 +1,2511 @@ +// Copyright 2022 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Emulated 128-bit vectors and operations, implemented as per-lane scalar loops. +// External include guard in highway.h - see comment there. + +#include +#include + +#include "hwy/base.h" +#include "hwy/ops/shared-inl.h" + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { + +template +using Full128 = Simd; + +// (Wrapper class required for overloading comparison operators.) +template +struct Vec128 { + HWY_INLINE Vec128() = default; + Vec128(const Vec128&) = default; + Vec128& operator=(const Vec128&) = default; + + HWY_INLINE Vec128& operator*=(const Vec128 other) { + return *this = (*this * other); + } + HWY_INLINE Vec128& operator/=(const Vec128 other) { + return *this = (*this / other); + } + HWY_INLINE Vec128& operator+=(const Vec128 other) { + return *this = (*this + other); + } + HWY_INLINE Vec128& operator-=(const Vec128 other) { + return *this = (*this - other); + } + HWY_INLINE Vec128& operator&=(const Vec128 other) { + return *this = (*this & other); + } + HWY_INLINE Vec128& operator|=(const Vec128 other) { + return *this = (*this | other); + } + HWY_INLINE Vec128& operator^=(const Vec128 other) { + return *this = (*this ^ other); + } + + // Behave like wasm128 (vectors can always hold 128 bits). generic_ops-inl.h + // relies on this for LoadInterleaved*. CAVEAT: this method of padding + // prevents using range-based for loops, especially in SumOfLanes, where it + // would be incorrect. Moving padding to another field would require handling + // the case where N = 16 / sizeof(T) (i.e. there is no padding), which is also + // awkward. + T raw[16 / sizeof(T)] = {}; +}; + +// 0 or FF..FF, same size as Vec128. +template +struct Mask128 { + using Raw = hwy::MakeUnsigned; + static HWY_INLINE Raw FromBool(bool b) { + return b ? static_cast(~Raw{0}) : 0; + } + + // Must match the size of Vec128.
+ Raw bits[16 / sizeof(T)] = {}; +}; + +namespace detail { + +// Deduce Simd from Vec128 +struct Deduce128 { + template + Simd operator()(Vec128) const { + return Simd(); + } +}; + +} // namespace detail + +template +using DFromV = decltype(detail::Deduce128()(V())); + +template +using TFromV = TFromD>; + +// ------------------------------ BitCast + +template +HWY_API Vec128 BitCast(Simd /* tag */, Vec128 v) { + Vec128 to; + CopySameSize(&v, &to); + return to; +} + +// ------------------------------ Set + +template +HWY_API Vec128 Zero(Simd /* tag */) { + Vec128 v; + ZeroBytes(v.raw); + return v; +} + +template +using VFromD = decltype(Zero(D())); + +template +HWY_API Vec128 Set(Simd /* tag */, const T2 t) { + Vec128 v; + for (size_t i = 0; i < N; ++i) { + v.raw[i] = static_cast(t); + } + return v; +} + +template +HWY_API Vec128 Undefined(Simd d) { + return Zero(d); +} + +namespace detail { + +template +HWY_INLINE constexpr T IncrementWithWraparound(hwy::FloatTag /*tag*/, T t) { + return t + T{1}; +} + +template +HWY_INLINE constexpr T IncrementWithWraparound(hwy::NonFloatTag /*tag*/, T t) { + using TU = MakeUnsigned; + return static_cast(static_cast(static_cast(t) + TU{1}) & + hwy::LimitsMax()); +} + +} // namespace detail + +template +HWY_API Vec128 Iota(const Simd /* tag */, T2 first) { + Vec128 v; + T counter = static_cast(first); + for (size_t i = 0; i < N; ++i) { + v.raw[i] = counter; + counter = detail::IncrementWithWraparound(hwy::IsFloatTag(), counter); + } + return v; +} + +// ================================================== LOGICAL + +// ------------------------------ Not +template +HWY_API Vec128 Not(const Vec128 v) { + const Simd d; + const RebindToUnsigned du; + using TU = TFromD; + VFromD vu = BitCast(du, v); + for (size_t i = 0; i < N; ++i) { + vu.raw[i] = static_cast(~vu.raw[i]); + } + return BitCast(d, vu); +} + +// ------------------------------ And +template +HWY_API Vec128 And(const Vec128 a, const Vec128 b) { + const Simd d; + const RebindToUnsigned du; + auto au = BitCast(du, a); + auto bu = BitCast(du, b); + for (size_t i = 0; i < N; ++i) { + au.raw[i] &= bu.raw[i]; + } + return BitCast(d, au); +} +template +HWY_API Vec128 operator&(const Vec128 a, const Vec128 b) { + return And(a, b); +} + +// ------------------------------ AndNot +template +HWY_API Vec128 AndNot(const Vec128 a, const Vec128 b) { + return And(Not(a), b); +} + +// ------------------------------ Or +template +HWY_API Vec128 Or(const Vec128 a, const Vec128 b) { + const Simd d; + const RebindToUnsigned du; + auto au = BitCast(du, a); + auto bu = BitCast(du, b); + for (size_t i = 0; i < N; ++i) { + au.raw[i] |= bu.raw[i]; + } + return BitCast(d, au); +} +template +HWY_API Vec128 operator|(const Vec128 a, const Vec128 b) { + return Or(a, b); +} + +// ------------------------------ Xor +template +HWY_API Vec128 Xor(const Vec128 a, const Vec128 b) { + const Simd d; + const RebindToUnsigned du; + auto au = BitCast(du, a); + auto bu = BitCast(du, b); + for (size_t i = 0; i < N; ++i) { + au.raw[i] ^= bu.raw[i]; + } + return BitCast(d, au); +} +template +HWY_API Vec128 operator^(const Vec128 a, const Vec128 b) { + return Xor(a, b); +} + +// ------------------------------ Or3 + +template +HWY_API Vec128 Or3(Vec128 o1, Vec128 o2, Vec128 o3) { + return Or(o1, Or(o2, o3)); +} + +// ------------------------------ OrAnd +template +HWY_API Vec128 OrAnd(const Vec128 o, const Vec128 a1, + const Vec128 a2) { + return Or(o, And(a1, a2)); +} + +// ------------------------------ IfVecThenElse +template +HWY_API Vec128 
IfVecThenElse(Vec128 mask, Vec128 yes, + Vec128 no) { + return Or(And(mask, yes), AndNot(mask, no)); +} + +// ------------------------------ CopySign +template +HWY_API Vec128 CopySign(const Vec128 magn, + const Vec128 sign) { + static_assert(IsFloat(), "Only makes sense for floating-point"); + const auto msb = SignBit(Simd()); + return Or(AndNot(msb, magn), And(msb, sign)); +} + +template +HWY_API Vec128 CopySignToAbs(const Vec128 abs, + const Vec128 sign) { + static_assert(IsFloat(), "Only makes sense for floating-point"); + return Or(abs, And(SignBit(Simd()), sign)); +} + +// ------------------------------ BroadcastSignBit +template +HWY_API Vec128 BroadcastSignBit(Vec128 v) { + // This is used inside ShiftRight, so we cannot implement in terms of it. + for (size_t i = 0; i < N; ++i) { + v.raw[i] = v.raw[i] < 0 ? T(-1) : T(0); + } + return v; +} + +// ------------------------------ Mask + +template +HWY_API Mask128 RebindMask(Simd /*tag*/, + Mask128 mask) { + Mask128 to; + CopySameSize(&mask, &to); + return to; +} + +// v must be 0 or FF..FF. +template +HWY_API Mask128 MaskFromVec(const Vec128 v) { + Mask128 mask; + CopySameSize(&v, &mask); + return mask; +} + +template +Vec128 VecFromMask(const Mask128 mask) { + Vec128 v; + CopySameSize(&mask, &v); + return v; +} + +template +Vec128 VecFromMask(Simd /* tag */, const Mask128 mask) { + return VecFromMask(mask); +} + +template +HWY_API Mask128 FirstN(Simd /*tag*/, size_t n) { + Mask128 m; + for (size_t i = 0; i < N; ++i) { + m.bits[i] = Mask128::FromBool(i < n); + } + return m; +} + +// Returns mask ? yes : no. +template +HWY_API Vec128 IfThenElse(const Mask128 mask, + const Vec128 yes, const Vec128 no) { + return IfVecThenElse(VecFromMask(mask), yes, no); +} + +template +HWY_API Vec128 IfThenElseZero(const Mask128 mask, + const Vec128 yes) { + return IfVecThenElse(VecFromMask(mask), yes, Zero(Simd())); +} + +template +HWY_API Vec128 IfThenZeroElse(const Mask128 mask, + const Vec128 no) { + return IfVecThenElse(VecFromMask(mask), Zero(Simd()), no); +} + +template +HWY_API Vec128 IfNegativeThenElse(Vec128 v, Vec128 yes, + Vec128 no) { + for (size_t i = 0; i < N; ++i) { + v.raw[i] = v.raw[i] < 0 ? 
yes.raw[i] : no.raw[i]; + } + return v; +} + +template +HWY_API Vec128 ZeroIfNegative(const Vec128 v) { + return IfNegativeThenElse(v, Zero(Simd()), v); +} + +// ------------------------------ Mask logical + +template +HWY_API Mask128 Not(const Mask128 m) { + return MaskFromVec(Not(VecFromMask(Simd(), m))); +} + +template +HWY_API Mask128 And(const Mask128 a, Mask128 b) { + const Simd d; + return MaskFromVec(And(VecFromMask(d, a), VecFromMask(d, b))); +} + +template +HWY_API Mask128 AndNot(const Mask128 a, Mask128 b) { + const Simd d; + return MaskFromVec(AndNot(VecFromMask(d, a), VecFromMask(d, b))); +} + +template +HWY_API Mask128 Or(const Mask128 a, Mask128 b) { + const Simd d; + return MaskFromVec(Or(VecFromMask(d, a), VecFromMask(d, b))); +} + +template +HWY_API Mask128 Xor(const Mask128 a, Mask128 b) { + const Simd d; + return MaskFromVec(Xor(VecFromMask(d, a), VecFromMask(d, b))); +} + +template +HWY_API Mask128 ExclusiveNeither(const Mask128 a, Mask128 b) { + const Simd d; + return MaskFromVec(AndNot(VecFromMask(d, a), Not(VecFromMask(d, b)))); +} + +// ================================================== SHIFTS + +// ------------------------------ ShiftLeft/ShiftRight (BroadcastSignBit) + +template +HWY_API Vec128 ShiftLeft(Vec128 v) { + static_assert(0 <= kBits && kBits < sizeof(T) * 8, "Invalid shift"); + for (size_t i = 0; i < N; ++i) { + const auto shifted = static_cast>(v.raw[i]) << kBits; + v.raw[i] = static_cast(shifted); + } + return v; +} + +template +HWY_API Vec128 ShiftRight(Vec128 v) { + static_assert(0 <= kBits && kBits < sizeof(T) * 8, "Invalid shift"); +#if __cplusplus >= 202002L + // Signed right shift is now guaranteed to be arithmetic (rounding toward + // negative infinity, i.e. shifting in the sign bit). + for (size_t i = 0; i < N; ++i) { + v.raw[i] = static_cast(v.raw[i] >> kBits); + } +#else + if (IsSigned()) { + // Emulate arithmetic shift using only logical (unsigned) shifts, because + // signed shifts are still implementation-defined. + using TU = hwy::MakeUnsigned; + for (size_t i = 0; i < N; ++i) { + const TU shifted = static_cast(static_cast(v.raw[i]) >> kBits); + const TU sign = v.raw[i] < 0 ? 
static_cast(~TU{0}) : 0; + const size_t sign_shift = + static_cast(static_cast(sizeof(TU)) * 8 - 1 - kBits); + const TU upper = static_cast(sign << sign_shift); + v.raw[i] = static_cast(shifted | upper); + } + } else { // T is unsigned + for (size_t i = 0; i < N; ++i) { + v.raw[i] = static_cast(v.raw[i] >> kBits); + } + } +#endif + return v; +} + +// ------------------------------ RotateRight (ShiftRight) + +namespace detail { + +// For partial specialization: kBits == 0 results in an invalid shift count +template +struct RotateRight { + template + HWY_INLINE Vec128 operator()(const Vec128 v) const { + return Or(ShiftRight(v), ShiftLeft(v)); + } +}; + +template <> +struct RotateRight<0> { + template + HWY_INLINE Vec128 operator()(const Vec128 v) const { + return v; + } +}; + +} // namespace detail + +template +HWY_API Vec128 RotateRight(const Vec128 v) { + static_assert(0 <= kBits && kBits < sizeof(T) * 8, "Invalid shift"); + return detail::RotateRight()(v); +} + +// ------------------------------ ShiftLeftSame + +template +HWY_API Vec128 ShiftLeftSame(Vec128 v, int bits) { + for (size_t i = 0; i < N; ++i) { + const auto shifted = static_cast>(v.raw[i]) << bits; + v.raw[i] = static_cast(shifted); + } + return v; +} + +template +HWY_API Vec128 ShiftRightSame(Vec128 v, int bits) { +#if __cplusplus >= 202002L + // Signed right shift is now guaranteed to be arithmetic (rounding toward + // negative infinity, i.e. shifting in the sign bit). + for (size_t i = 0; i < N; ++i) { + v.raw[i] = static_cast(v.raw[i] >> bits); + } +#else + if (IsSigned()) { + // Emulate arithmetic shift using only logical (unsigned) shifts, because + // signed shifts are still implementation-defined. + using TU = hwy::MakeUnsigned; + for (size_t i = 0; i < N; ++i) { + const TU shifted = static_cast(static_cast(v.raw[i]) >> bits); + const TU sign = v.raw[i] < 0 ? static_cast(~TU{0}) : 0; + const size_t sign_shift = + static_cast(static_cast(sizeof(TU)) * 8 - 1 - bits); + const TU upper = static_cast(sign << sign_shift); + v.raw[i] = static_cast(shifted | upper); + } + } else { + for (size_t i = 0; i < N; ++i) { + v.raw[i] = static_cast(v.raw[i] >> bits); // unsigned, logical shift + } + } +#endif + return v; +} + +// ------------------------------ Shl + +template +HWY_API Vec128 operator<<(Vec128 v, const Vec128 bits) { + for (size_t i = 0; i < N; ++i) { + const auto shifted = static_cast>(v.raw[i]) + << bits.raw[i]; + v.raw[i] = static_cast(shifted); + } + return v; +} + +template +HWY_API Vec128 operator>>(Vec128 v, const Vec128 bits) { +#if __cplusplus >= 202002L + // Signed right shift is now guaranteed to be arithmetic (rounding toward + // negative infinity, i.e. shifting in the sign bit). + for (size_t i = 0; i < N; ++i) { + v.raw[i] = static_cast(v.raw[i] >> bits.raw[i]); + } +#else + if (IsSigned()) { + // Emulate arithmetic shift using only logical (unsigned) shifts, because + // signed shifts are still implementation-defined. + using TU = hwy::MakeUnsigned; + for (size_t i = 0; i < N; ++i) { + const TU shifted = + static_cast(static_cast(v.raw[i]) >> bits.raw[i]); + const TU sign = v.raw[i] < 0 ? 
static_cast(~TU{0}) : 0; + const size_t sign_shift = static_cast( + static_cast(sizeof(TU)) * 8 - 1 - bits.raw[i]); + const TU upper = static_cast(sign << sign_shift); + v.raw[i] = static_cast(shifted | upper); + } + } else { // T is unsigned + for (size_t i = 0; i < N; ++i) { + v.raw[i] = static_cast(v.raw[i] >> bits.raw[i]); + } + } +#endif + return v; +} + +// ================================================== ARITHMETIC + +// Tag dispatch instead of SFINAE for MSVC 2017 compatibility +namespace detail { + +template +HWY_INLINE Vec128 Add(hwy::NonFloatTag /*tag*/, Vec128 a, + Vec128 b) { + for (size_t i = 0; i < N; ++i) { + const uint64_t a64 = static_cast(a.raw[i]); + const uint64_t b64 = static_cast(b.raw[i]); + a.raw[i] = static_cast((a64 + b64) & static_cast(~T(0))); + } + return a; +} +template +HWY_INLINE Vec128 Sub(hwy::NonFloatTag /*tag*/, Vec128 a, + Vec128 b) { + for (size_t i = 0; i < N; ++i) { + const uint64_t a64 = static_cast(a.raw[i]); + const uint64_t b64 = static_cast(b.raw[i]); + a.raw[i] = static_cast((a64 - b64) & static_cast(~T(0))); + } + return a; +} + +template +HWY_INLINE Vec128 Add(hwy::FloatTag /*tag*/, Vec128 a, + const Vec128 b) { + for (size_t i = 0; i < N; ++i) { + a.raw[i] += b.raw[i]; + } + return a; +} + +template +HWY_INLINE Vec128 Sub(hwy::FloatTag /*tag*/, Vec128 a, + const Vec128 b) { + for (size_t i = 0; i < N; ++i) { + a.raw[i] -= b.raw[i]; + } + return a; +} + +} // namespace detail + +template +HWY_API Vec128 operator-(Vec128 a, const Vec128 b) { + return detail::Sub(hwy::IsFloatTag(), a, b); +} +template +HWY_API Vec128 operator+(Vec128 a, const Vec128 b) { + return detail::Add(hwy::IsFloatTag(), a, b); +} + +// ------------------------------ SumsOf8 + +template +HWY_API Vec128 SumsOf8(const Vec128 v) { + Vec128 sums; + for (size_t i = 0; i < N; ++i) { + sums.raw[i / 8] += v.raw[i]; + } + return sums; +} + +// ------------------------------ SaturatedAdd +template +HWY_API Vec128 SaturatedAdd(Vec128 a, const Vec128 b) { + for (size_t i = 0; i < N; ++i) { + a.raw[i] = static_cast( + HWY_MIN(HWY_MAX(hwy::LowestValue(), a.raw[i] + b.raw[i]), + hwy::HighestValue())); + } + return a; +} + +// ------------------------------ SaturatedSub +template +HWY_API Vec128 SaturatedSub(Vec128 a, const Vec128 b) { + for (size_t i = 0; i < N; ++i) { + a.raw[i] = static_cast( + HWY_MIN(HWY_MAX(hwy::LowestValue(), a.raw[i] - b.raw[i]), + hwy::HighestValue())); + } + return a; +} + +// ------------------------------ AverageRound +template +HWY_API Vec128 AverageRound(Vec128 a, const Vec128 b) { + static_assert(!IsSigned(), "Only for unsigned"); + for (size_t i = 0; i < N; ++i) { + a.raw[i] = static_cast((a.raw[i] + b.raw[i] + 1) / 2); + } + return a; +} + +// ------------------------------ Abs + +// Tag dispatch instead of SFINAE for MSVC 2017 compatibility +namespace detail { + +template +HWY_INLINE Vec128 Abs(SignedTag /*tag*/, Vec128 a) { + for (size_t i = 0; i < N; ++i) { + const T s = a.raw[i]; + const T min = hwy::LimitsMin(); + a.raw[i] = static_cast((s >= 0 || s == min) ? 
a.raw[i] : -s); + } + return a; +} + +template +HWY_INLINE Vec128 Abs(hwy::FloatTag /*tag*/, Vec128 v) { + for (size_t i = 0; i < N; ++i) { + v.raw[i] = std::abs(v.raw[i]); + } + return v; +} + +} // namespace detail + +template +HWY_API Vec128 Abs(Vec128 a) { + return detail::Abs(hwy::TypeTag(), a); +} + +// ------------------------------ Min/Max + +// Tag dispatch instead of SFINAE for MSVC 2017 compatibility +namespace detail { + +template +HWY_INLINE Vec128 Min(hwy::NonFloatTag /*tag*/, Vec128 a, + const Vec128 b) { + for (size_t i = 0; i < N; ++i) { + a.raw[i] = HWY_MIN(a.raw[i], b.raw[i]); + } + return a; +} +template +HWY_INLINE Vec128 Max(hwy::NonFloatTag /*tag*/, Vec128 a, + const Vec128 b) { + for (size_t i = 0; i < N; ++i) { + a.raw[i] = HWY_MAX(a.raw[i], b.raw[i]); + } + return a; +} + +template +HWY_INLINE Vec128 Min(hwy::FloatTag /*tag*/, Vec128 a, + const Vec128 b) { + for (size_t i = 0; i < N; ++i) { + if (std::isnan(a.raw[i])) { + a.raw[i] = b.raw[i]; + } else if (std::isnan(b.raw[i])) { + // no change + } else { + a.raw[i] = HWY_MIN(a.raw[i], b.raw[i]); + } + } + return a; +} +template +HWY_INLINE Vec128 Max(hwy::FloatTag /*tag*/, Vec128 a, + const Vec128 b) { + for (size_t i = 0; i < N; ++i) { + if (std::isnan(a.raw[i])) { + a.raw[i] = b.raw[i]; + } else if (std::isnan(b.raw[i])) { + // no change + } else { + a.raw[i] = HWY_MAX(a.raw[i], b.raw[i]); + } + } + return a; +} + +} // namespace detail + +template +HWY_API Vec128 Min(Vec128 a, const Vec128 b) { + return detail::Min(hwy::IsFloatTag(), a, b); +} + +template +HWY_API Vec128 Max(Vec128 a, const Vec128 b) { + return detail::Max(hwy::IsFloatTag(), a, b); +} + +// ------------------------------ Neg + +// Tag dispatch instead of SFINAE for MSVC 2017 compatibility +namespace detail { + +template +HWY_API Vec128 Neg(hwy::NonFloatTag /*tag*/, Vec128 v) { + return Zero(Simd()) - v; +} + +template +HWY_API Vec128 Neg(hwy::FloatTag /*tag*/, Vec128 v) { + return Xor(v, SignBit(Simd())); +} + +} // namespace detail + +template +HWY_API Vec128 Neg(Vec128 v) { + return detail::Neg(hwy::IsFloatTag(), v); +} + +// ------------------------------ Mul/Div + +// Tag dispatch instead of SFINAE for MSVC 2017 compatibility +namespace detail { + +template +HWY_INLINE Vec128 Mul(hwy::FloatTag /*tag*/, Vec128 a, + const Vec128 b) { + for (size_t i = 0; i < N; ++i) { + a.raw[i] *= b.raw[i]; + } + return a; +} + +template +HWY_INLINE Vec128 Mul(SignedTag /*tag*/, Vec128 a, + const Vec128 b) { + for (size_t i = 0; i < N; ++i) { + a.raw[i] = static_cast(static_cast(a.raw[i]) * b.raw[i]); + } + return a; +} + +template +HWY_INLINE Vec128 Mul(UnsignedTag /*tag*/, Vec128 a, + const Vec128 b) { + for (size_t i = 0; i < N; ++i) { + a.raw[i] = static_cast(static_cast(a.raw[i]) * b.raw[i]); + } + return a; +} + +} // namespace detail + +template +HWY_API Vec128 operator*(Vec128 a, const Vec128 b) { + return detail::Mul(hwy::TypeTag(), a, b); +} + +template +HWY_API Vec128 operator/(Vec128 a, const Vec128 b) { + for (size_t i = 0; i < N; ++i) { + a.raw[i] /= b.raw[i]; + } + return a; +} + +// Returns the upper 16 bits of a * b in each lane. +template +HWY_API Vec128 MulHigh(Vec128 a, + const Vec128 b) { + for (size_t i = 0; i < N; ++i) { + a.raw[i] = static_cast((int32_t{a.raw[i]} * b.raw[i]) >> 16); + } + return a; +} +template +HWY_API Vec128 MulHigh(Vec128 a, + const Vec128 b) { + for (size_t i = 0; i < N; ++i) { + // Cast to uint32_t first to prevent overflow. Otherwise the result of + // uint16_t * uint16_t is in "int" which may overflow. 
In practice the + // result is the same but this way it is also defined. + a.raw[i] = static_cast( + (static_cast(a.raw[i]) * static_cast(b.raw[i])) >> + 16); + } + return a; +} + +template +HWY_API Vec128 MulFixedPoint15(Vec128 a, + Vec128 b) { + for (size_t i = 0; i < N; ++i) { + a.raw[i] = static_cast((2 * a.raw[i] * b.raw[i] + 32768) >> 16); + } + return a; +} + +// Multiplies even lanes (0, 2 ..) and returns the double-wide result. +template +HWY_API Vec128 MulEven(const Vec128 a, + const Vec128 b) { + Vec128 mul; + for (size_t i = 0; i < N; i += 2) { + const int64_t a64 = a.raw[i]; + mul.raw[i / 2] = a64 * b.raw[i]; + } + return mul; +} +template +HWY_API Vec128 MulEven(Vec128 a, + const Vec128 b) { + Vec128 mul; + for (size_t i = 0; i < N; i += 2) { + const uint64_t a64 = a.raw[i]; + mul.raw[i / 2] = a64 * b.raw[i]; + } + return mul; +} + +template +HWY_API Vec128 MulOdd(const Vec128 a, + const Vec128 b) { + Vec128 mul; + for (size_t i = 0; i < N; i += 2) { + const int64_t a64 = a.raw[i + 1]; + mul.raw[i / 2] = a64 * b.raw[i + 1]; + } + return mul; +} +template +HWY_API Vec128 MulOdd(Vec128 a, + const Vec128 b) { + Vec128 mul; + for (size_t i = 0; i < N; i += 2) { + const uint64_t a64 = a.raw[i + 1]; + mul.raw[i / 2] = a64 * b.raw[i + 1]; + } + return mul; +} + +template +HWY_API Vec128 ApproximateReciprocal(Vec128 v) { + for (size_t i = 0; i < N; ++i) { + // Zero inputs are allowed, but callers are responsible for replacing the + // return value with something else (typically using IfThenElse). This check + // avoids a ubsan error. The result is arbitrary. + v.raw[i] = (std::abs(v.raw[i]) == 0.0f) ? 0.0f : 1.0f / v.raw[i]; + } + return v; +} + +template +HWY_API Vec128 AbsDiff(Vec128 a, const Vec128 b) { + return Abs(a - b); +} + +// ------------------------------ Floating-point multiply-add variants + +template +HWY_API Vec128 MulAdd(Vec128 mul, const Vec128 x, + const Vec128 add) { + return mul * x + add; +} + +template +HWY_API Vec128 NegMulAdd(Vec128 mul, const Vec128 x, + const Vec128 add) { + return add - mul * x; +} + +template +HWY_API Vec128 MulSub(Vec128 mul, const Vec128 x, + const Vec128 sub) { + return mul * x - sub; +} + +template +HWY_API Vec128 NegMulSub(Vec128 mul, const Vec128 x, + const Vec128 sub) { + return Neg(mul) * x - sub; +} + +// ------------------------------ Floating-point square root + +template +HWY_API Vec128 ApproximateReciprocalSqrt(Vec128 v) { + for (size_t i = 0; i < N; ++i) { + const float half = v.raw[i] * 0.5f; + uint32_t bits; + CopySameSize(&v.raw[i], &bits); + // Initial guess based on log2(f) + bits = 0x5F3759DF - (bits >> 1); + CopySameSize(&bits, &v.raw[i]); + // One Newton-Raphson iteration + v.raw[i] = v.raw[i] * (1.5f - (half * v.raw[i] * v.raw[i])); + } + return v; +} + +template +HWY_API Vec128 Sqrt(Vec128 v) { + for (size_t i = 0; i < N; ++i) { + v.raw[i] = std::sqrt(v.raw[i]); + } + return v; +} + +// ------------------------------ Floating-point rounding + +template +HWY_API Vec128 Round(Vec128 v) { + using TI = MakeSigned; + const Vec128 a = Abs(v); + for (size_t i = 0; i < N; ++i) { + if (!(a.raw[i] < MantissaEnd())) { // Huge or NaN + continue; + } + const T bias = v.raw[i] < T(0.0) ? T(-0.5) : T(0.5); + const TI rounded = static_cast(v.raw[i] + bias); + if (rounded == 0) { + v.raw[i] = v.raw[i] < 0 ? T{-0} : T{0}; + continue; + } + const T rounded_f = static_cast(rounded); + // Round to even + if ((rounded & 1) && std::abs(rounded_f - v.raw[i]) == T(0.5)) { + v.raw[i] = static_cast(rounded - (v.raw[i] < T(0) ? 
-1 : 1)); + continue; + } + v.raw[i] = rounded_f; + } + return v; +} + +// Round-to-nearest even. +template +HWY_API Vec128 NearestInt(const Vec128 v) { + using T = float; + using TI = int32_t; + + const Vec128 abs = Abs(v); + Vec128 ret; + for (size_t i = 0; i < N; ++i) { + const bool signbit = std::signbit(v.raw[i]); + + if (!(abs.raw[i] < MantissaEnd())) { // Huge or NaN + // Check if too large to cast or NaN + if (!(abs.raw[i] <= static_cast(LimitsMax()))) { + ret.raw[i] = signbit ? LimitsMin() : LimitsMax(); + continue; + } + ret.raw[i] = static_cast(v.raw[i]); + continue; + } + const T bias = v.raw[i] < T(0.0) ? T(-0.5) : T(0.5); + const TI rounded = static_cast(v.raw[i] + bias); + if (rounded == 0) { + ret.raw[i] = 0; + continue; + } + const T rounded_f = static_cast(rounded); + // Round to even + if ((rounded & 1) && std::abs(rounded_f - v.raw[i]) == T(0.5)) { + ret.raw[i] = rounded - (signbit ? -1 : 1); + continue; + } + ret.raw[i] = rounded; + } + return ret; +} + +template +HWY_API Vec128 Trunc(Vec128 v) { + using TI = MakeSigned; + const Vec128 abs = Abs(v); + for (size_t i = 0; i < N; ++i) { + if (!(abs.raw[i] <= MantissaEnd())) { // Huge or NaN + continue; + } + const TI truncated = static_cast(v.raw[i]); + if (truncated == 0) { + v.raw[i] = v.raw[i] < 0 ? -T{0} : T{0}; + continue; + } + v.raw[i] = static_cast(truncated); + } + return v; +} + +// Toward +infinity, aka ceiling +template +Vec128 Ceil(Vec128 v) { + constexpr int kMantissaBits = MantissaBits(); + using Bits = MakeUnsigned; + const Bits kExponentMask = MaxExponentField(); + const Bits kMantissaMask = MantissaMask(); + const Bits kBias = kExponentMask / 2; + + for (size_t i = 0; i < N; ++i) { + const bool positive = v.raw[i] > Float(0.0); + + Bits bits; + CopySameSize(&v.raw[i], &bits); + + const int exponent = + static_cast(((bits >> kMantissaBits) & kExponentMask) - kBias); + // Already an integer. + if (exponent >= kMantissaBits) continue; + // |v| <= 1 => 0 or 1. + if (exponent < 0) { + v.raw[i] = positive ? Float{1} : Float{-0.0}; + continue; + } + + const Bits mantissa_mask = kMantissaMask >> exponent; + // Already an integer + if ((bits & mantissa_mask) == 0) continue; + + // Clear fractional bits and round up + if (positive) bits += (kMantissaMask + 1) >> exponent; + bits &= ~mantissa_mask; + + CopySameSize(&bits, &v.raw[i]); + } + return v; +} + +// Toward -infinity, aka floor +template +Vec128 Floor(Vec128 v) { + constexpr int kMantissaBits = MantissaBits(); + using Bits = MakeUnsigned; + const Bits kExponentMask = MaxExponentField(); + const Bits kMantissaMask = MantissaMask(); + const Bits kBias = kExponentMask / 2; + + for (size_t i = 0; i < N; ++i) { + const bool negative = v.raw[i] < Float(0.0); + + Bits bits; + CopySameSize(&v.raw[i], &bits); + + const int exponent = + static_cast(((bits >> kMantissaBits) & kExponentMask) - kBias); + // Already an integer. + if (exponent >= kMantissaBits) continue; + // |v| <= 1 => -1 or 0. + if (exponent < 0) { + v.raw[i] = negative ? 
Float(-1.0) : Float(0.0); + continue; + } + + const Bits mantissa_mask = kMantissaMask >> exponent; + // Already an integer + if ((bits & mantissa_mask) == 0) continue; + + // Clear fractional bits and round down + if (negative) bits += (kMantissaMask + 1) >> exponent; + bits &= ~mantissa_mask; + + CopySameSize(&bits, &v.raw[i]); + } + return v; +} + +// ------------------------------ Floating-point classification + +template +HWY_API Mask128 IsNaN(const Vec128 v) { + Mask128 ret; + for (size_t i = 0; i < N; ++i) { + // std::isnan returns false for 0x7F..FF in clang AVX3 builds, so DIY. + MakeUnsigned bits; + CopySameSize(&v.raw[i], &bits); + bits += bits; + bits >>= 1; // clear sign bit + // NaN if all exponent bits are set and the mantissa is not zero. + ret.bits[i] = Mask128::FromBool(bits > ExponentMask()); + } + return ret; +} + +template +HWY_API Mask128 IsInf(const Vec128 v) { + static_assert(IsFloat(), "Only for float"); + const Simd d; + const RebindToSigned di; + const VFromD vi = BitCast(di, v); + // 'Shift left' to clear the sign bit, check for exponent=max and mantissa=0. + return RebindMask(d, Eq(Add(vi, vi), Set(di, hwy::MaxExponentTimes2()))); +} + +// Returns whether normal/subnormal/zero. +template +HWY_API Mask128 IsFinite(const Vec128 v) { + static_assert(IsFloat(), "Only for float"); + const Simd d; + const RebindToUnsigned du; + const RebindToSigned di; // cheaper than unsigned comparison + using VI = VFromD; + using VU = VFromD; + const VU vu = BitCast(du, v); + // 'Shift left' to clear the sign bit, then right so we can compare with the + // max exponent (cannot compare with MaxExponentTimes2 directly because it is + // negative and non-negative floats would be greater). + const VI exp = + BitCast(di, ShiftRight() + 1>(Add(vu, vu))); + return RebindMask(d, Lt(exp, Set(di, hwy::MaxExponentField()))); +} + +// ================================================== COMPARE + +template +HWY_API Mask128 operator==(const Vec128 a, const Vec128 b) { + Mask128 m; + for (size_t i = 0; i < N; ++i) { + m.bits[i] = Mask128::FromBool(a.raw[i] == b.raw[i]); + } + return m; +} + +template +HWY_API Mask128 operator!=(const Vec128 a, const Vec128 b) { + Mask128 m; + for (size_t i = 0; i < N; ++i) { + m.bits[i] = Mask128::FromBool(a.raw[i] != b.raw[i]); + } + return m; +} + +template +HWY_API Mask128 TestBit(const Vec128 v, const Vec128 bit) { + static_assert(!hwy::IsFloat(), "Only integer vectors supported"); + return (v & bit) == bit; +} + +template +HWY_API Mask128 operator<(const Vec128 a, const Vec128 b) { + Mask128 m; + for (size_t i = 0; i < N; ++i) { + m.bits[i] = Mask128::FromBool(a.raw[i] < b.raw[i]); + } + return m; +} +template +HWY_API Mask128 operator>(const Vec128 a, const Vec128 b) { + Mask128 m; + for (size_t i = 0; i < N; ++i) { + m.bits[i] = Mask128::FromBool(a.raw[i] > b.raw[i]); + } + return m; +} + +template +HWY_API Mask128 operator<=(const Vec128 a, const Vec128 b) { + Mask128 m; + for (size_t i = 0; i < N; ++i) { + m.bits[i] = Mask128::FromBool(a.raw[i] <= b.raw[i]); + } + return m; +} +template +HWY_API Mask128 operator>=(const Vec128 a, const Vec128 b) { + Mask128 m; + for (size_t i = 0; i < N; ++i) { + m.bits[i] = Mask128::FromBool(a.raw[i] >= b.raw[i]); + } + return m; +} + +// ------------------------------ Lt128 + +// Only makes sense for full vectors of u64. 
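+// Example: a = {lo = 1, hi = 9} and b = {lo = 2, hi = 9}: the upper halves
+// tie, so the result is lo(a) < lo(b) = true, broadcast into both u64 lanes.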
+HWY_API Mask128 Lt128(Simd /* tag */, + Vec128 a, const Vec128 b) { + const bool lt = + (a.raw[1] < b.raw[1]) || (a.raw[1] == b.raw[1] && a.raw[0] < b.raw[0]); + Mask128 ret; + ret.bits[0] = ret.bits[1] = Mask128::FromBool(lt); + return ret; +} + +HWY_API Mask128 Lt128Upper(Simd /* tag */, + Vec128 a, + const Vec128 b) { + const bool lt = a.raw[1] < b.raw[1]; + Mask128 ret; + ret.bits[0] = ret.bits[1] = Mask128::FromBool(lt); + return ret; +} + +// ------------------------------ Eq128 + +// Only makes sense for full vectors of u64. +HWY_API Mask128 Eq128(Simd /* tag */, + Vec128 a, const Vec128 b) { + const bool eq = a.raw[1] == b.raw[1] && a.raw[0] == b.raw[0]; + Mask128 ret; + ret.bits[0] = ret.bits[1] = Mask128::FromBool(eq); + return ret; +} + +HWY_API Mask128 Ne128(Simd /* tag */, + Vec128 a, const Vec128 b) { + const bool ne = a.raw[1] != b.raw[1] || a.raw[0] != b.raw[0]; + Mask128 ret; + ret.bits[0] = ret.bits[1] = Mask128::FromBool(ne); + return ret; +} + +HWY_API Mask128 Eq128Upper(Simd /* tag */, + Vec128 a, + const Vec128 b) { + const bool eq = a.raw[1] == b.raw[1]; + Mask128 ret; + ret.bits[0] = ret.bits[1] = Mask128::FromBool(eq); + return ret; +} + +HWY_API Mask128 Ne128Upper(Simd /* tag */, + Vec128 a, + const Vec128 b) { + const bool ne = a.raw[1] != b.raw[1]; + Mask128 ret; + ret.bits[0] = ret.bits[1] = Mask128::FromBool(ne); + return ret; +} + +// ------------------------------ Min128, Max128 (Lt128) + +template > +HWY_API V Min128(D d, const V a, const V b) { + return IfThenElse(Lt128(d, a, b), a, b); +} + +template > +HWY_API V Max128(D d, const V a, const V b) { + return IfThenElse(Lt128(d, b, a), a, b); +} + +template > +HWY_API V Min128Upper(D d, const V a, const V b) { + return IfThenElse(Lt128Upper(d, a, b), a, b); +} + +template > +HWY_API V Max128Upper(D d, const V a, const V b) { + return IfThenElse(Lt128Upper(d, b, a), a, b); +} + +// ================================================== MEMORY + +// ------------------------------ Load + +template +HWY_API Vec128 Load(Simd /* tag */, + const T* HWY_RESTRICT aligned) { + Vec128 v; + CopyBytes(aligned, v.raw); // copy from array + return v; +} + +template +HWY_API Vec128 MaskedLoad(Mask128 m, Simd d, + const T* HWY_RESTRICT aligned) { + return IfThenElseZero(m, Load(d, aligned)); +} + +template +HWY_API Vec128 LoadU(Simd d, const T* HWY_RESTRICT p) { + return Load(d, p); +} + +// In some use cases, "load single lane" is sufficient; otherwise avoid this. +template +HWY_API Vec128 LoadDup128(Simd d, + const T* HWY_RESTRICT aligned) { + return Load(d, aligned); +} + +// ------------------------------ Store + +template +HWY_API void Store(const Vec128 v, Simd /* tag */, + T* HWY_RESTRICT aligned) { + CopyBytes(v.raw, aligned); // copy to array +} + +template +HWY_API void StoreU(const Vec128 v, Simd d, T* HWY_RESTRICT p) { + Store(v, d, p); +} + +template +HWY_API void BlendedStore(const Vec128 v, Mask128 m, + Simd /* tag */, T* HWY_RESTRICT p) { + for (size_t i = 0; i < N; ++i) { + if (m.bits[i]) p[i] = v.raw[i]; + } +} + +// ------------------------------ LoadInterleaved2/3/4 + +// Per-target flag to prevent generic_ops-inl.h from defining LoadInterleaved2. +// We implement those here because scalar code is likely faster than emulation +// via shuffles. 
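+// Usage sketch (illustrative; `samples` is a hypothetical pointer to at
+// least 2 * N interleaved values):
+//   Full128<int16_t> d;
+//   Vec128<int16_t, 8> left, right;
+//   LoadInterleaved2(d, samples, left, right);  // L0 R0 L1 R1 .. -> Ls, Rs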
+#ifdef HWY_NATIVE_LOAD_STORE_INTERLEAVED +#undef HWY_NATIVE_LOAD_STORE_INTERLEAVED +#else +#define HWY_NATIVE_LOAD_STORE_INTERLEAVED +#endif + +template +HWY_API void LoadInterleaved2(Simd d, const T* HWY_RESTRICT unaligned, + Vec128& v0, Vec128& v1) { + alignas(16) T buf0[N]; + alignas(16) T buf1[N]; + for (size_t i = 0; i < N; ++i) { + buf0[i] = *unaligned++; + buf1[i] = *unaligned++; + } + v0 = Load(d, buf0); + v1 = Load(d, buf1); +} + +template +HWY_API void LoadInterleaved3(Simd d, const T* HWY_RESTRICT unaligned, + Vec128& v0, Vec128& v1, + Vec128& v2) { + alignas(16) T buf0[N]; + alignas(16) T buf1[N]; + alignas(16) T buf2[N]; + for (size_t i = 0; i < N; ++i) { + buf0[i] = *unaligned++; + buf1[i] = *unaligned++; + buf2[i] = *unaligned++; + } + v0 = Load(d, buf0); + v1 = Load(d, buf1); + v2 = Load(d, buf2); +} + +template +HWY_API void LoadInterleaved4(Simd d, const T* HWY_RESTRICT unaligned, + Vec128& v0, Vec128& v1, + Vec128& v2, Vec128& v3) { + alignas(16) T buf0[N]; + alignas(16) T buf1[N]; + alignas(16) T buf2[N]; + alignas(16) T buf3[N]; + for (size_t i = 0; i < N; ++i) { + buf0[i] = *unaligned++; + buf1[i] = *unaligned++; + buf2[i] = *unaligned++; + buf3[i] = *unaligned++; + } + v0 = Load(d, buf0); + v1 = Load(d, buf1); + v2 = Load(d, buf2); + v3 = Load(d, buf3); +} + +// ------------------------------ StoreInterleaved2/3/4 + +template +HWY_API void StoreInterleaved2(const Vec128 v0, const Vec128 v1, + Simd /* tag */, + T* HWY_RESTRICT unaligned) { + for (size_t i = 0; i < N; ++i) { + *unaligned++ = v0.raw[i]; + *unaligned++ = v1.raw[i]; + } +} + +template +HWY_API void StoreInterleaved3(const Vec128 v0, const Vec128 v1, + const Vec128 v2, Simd /* tag */, + T* HWY_RESTRICT unaligned) { + for (size_t i = 0; i < N; ++i) { + *unaligned++ = v0.raw[i]; + *unaligned++ = v1.raw[i]; + *unaligned++ = v2.raw[i]; + } +} + +template +HWY_API void StoreInterleaved4(const Vec128 v0, const Vec128 v1, + const Vec128 v2, const Vec128 v3, + Simd /* tag */, + T* HWY_RESTRICT unaligned) { + for (size_t i = 0; i < N; ++i) { + *unaligned++ = v0.raw[i]; + *unaligned++ = v1.raw[i]; + *unaligned++ = v2.raw[i]; + *unaligned++ = v3.raw[i]; + } +} + +// ------------------------------ Stream + +template +HWY_API void Stream(const Vec128 v, Simd d, + T* HWY_RESTRICT aligned) { + Store(v, d, aligned); +} + +// ------------------------------ Scatter + +template +HWY_API void ScatterOffset(Vec128 v, Simd /* tag */, T* base, + const Vec128 offset) { + static_assert(sizeof(T) == sizeof(Offset), "Must match for portability"); + for (size_t i = 0; i < N; ++i) { + uint8_t* const base8 = reinterpret_cast(base) + offset.raw[i]; + CopyBytes(&v.raw[i], base8); // copy to bytes + } +} + +template +HWY_API void ScatterIndex(Vec128 v, Simd /* tag */, + T* HWY_RESTRICT base, const Vec128 index) { + static_assert(sizeof(T) == sizeof(Index), "Must match for portability"); + for (size_t i = 0; i < N; ++i) { + base[index.raw[i]] = v.raw[i]; + } +} + +// ------------------------------ Gather + +template +HWY_API Vec128 GatherOffset(Simd /* tag */, const T* base, + const Vec128 offset) { + static_assert(sizeof(T) == sizeof(Offset), "Must match for portability"); + Vec128 v; + for (size_t i = 0; i < N; ++i) { + const uint8_t* base8 = + reinterpret_cast(base) + offset.raw[i]; + CopyBytes(base8, &v.raw[i]); // copy from bytes + } + return v; +} + +template +HWY_API Vec128 GatherIndex(Simd /* tag */, + const T* HWY_RESTRICT base, + const Vec128 index) { + static_assert(sizeof(T) == sizeof(Index), "Must match for portability"); + 
Vec128 v; + for (size_t i = 0; i < N; ++i) { + v.raw[i] = base[index.raw[i]]; + } + return v; +} + +// ================================================== CONVERT + +// ConvertTo and DemoteTo with floating-point input and integer output truncate +// (rounding toward zero). + +template +HWY_API Vec128 PromoteTo(Simd /* tag */, + Vec128 from) { + static_assert(sizeof(ToT) > sizeof(FromT), "Not promoting"); + Vec128 ret; + for (size_t i = 0; i < N; ++i) { + // For bits Y > X, floatX->floatY and intX->intY are always representable. + ret.raw[i] = static_cast(from.raw[i]); + } + return ret; +} + +// MSVC 19.10 cannot deduce the argument type if HWY_IF_FLOAT(FromT) is here, +// so we overload for FromT=double and ToT={float,int32_t}. +template +HWY_API Vec128 DemoteTo(Simd /* tag */, + Vec128 from) { + Vec128 ret; + for (size_t i = 0; i < N; ++i) { + // Prevent ubsan errors when converting float to narrower integer/float + if (std::isinf(from.raw[i]) || + std::fabs(from.raw[i]) > static_cast(HighestValue())) { + ret.raw[i] = std::signbit(from.raw[i]) ? LowestValue() + : HighestValue(); + continue; + } + ret.raw[i] = static_cast(from.raw[i]); + } + return ret; +} +template +HWY_API Vec128 DemoteTo(Simd /* tag */, + Vec128 from) { + Vec128 ret; + for (size_t i = 0; i < N; ++i) { + // Prevent ubsan errors when converting int32_t to narrower integer/int32_t + if (std::isinf(from.raw[i]) || + std::fabs(from.raw[i]) > static_cast(HighestValue())) { + ret.raw[i] = std::signbit(from.raw[i]) ? LowestValue() + : HighestValue(); + continue; + } + ret.raw[i] = static_cast(from.raw[i]); + } + return ret; +} + +template +HWY_API Vec128 DemoteTo(Simd /* tag */, + Vec128 from) { + static_assert(!IsFloat(), "FromT=double are handled above"); + static_assert(sizeof(ToT) < sizeof(FromT), "Not demoting"); + + Vec128 ret; + for (size_t i = 0; i < N; ++i) { + // Int to int: choose closest value in ToT to `from` (avoids UB) + from.raw[i] = + HWY_MIN(HWY_MAX(LimitsMin(), from.raw[i]), LimitsMax()); + ret.raw[i] = static_cast(from.raw[i]); + } + return ret; +} + +template +HWY_API Vec128 ReorderDemote2To( + Simd dbf16, Vec128 a, Vec128 b) { + const Repartition du32; + const Vec128 b_in_lower = ShiftRight<16>(BitCast(du32, b)); + // Avoid OddEven - we want the upper half of `a` even on big-endian systems. 
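+// (bfloat16 is the upper 16 bits of a binary32 value, so the mask keeps the
+// upper half of each 32-bit lane of `a`, while b_in_lower already holds b's
+// upper 16 bits in the lower half thanks to the ShiftRight<16> above.)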
+ const Vec128 a_mask = Set(du32, 0xFFFF0000); + return BitCast(dbf16, IfVecThenElse(a_mask, BitCast(du32, a), b_in_lower)); +} + +template +HWY_API Vec128 ReorderDemote2To(Simd /*d16*/, + Vec128 a, + Vec128 b) { + const int16_t min = LimitsMin(); + const int16_t max = LimitsMax(); + Vec128 ret; + for (size_t i = 0; i < N; ++i) { + ret.raw[i] = static_cast(HWY_MIN(HWY_MAX(min, a.raw[i]), max)); + } + for (size_t i = 0; i < N; ++i) { + ret.raw[N + i] = static_cast(HWY_MIN(HWY_MAX(min, b.raw[i]), max)); + } + return ret; +} + +namespace detail { + +HWY_INLINE void StoreU16ToF16(const uint16_t val, + hwy::float16_t* HWY_RESTRICT to) { + CopySameSize(&val, to); +} + +HWY_INLINE uint16_t U16FromF16(const hwy::float16_t* HWY_RESTRICT from) { + uint16_t bits16; + CopySameSize(from, &bits16); + return bits16; +} + +} // namespace detail + +template +HWY_API Vec128 PromoteTo(Simd /* tag */, + const Vec128 v) { + Vec128 ret; + for (size_t i = 0; i < N; ++i) { + const uint16_t bits16 = detail::U16FromF16(&v.raw[i]); + const uint32_t sign = static_cast(bits16 >> 15); + const uint32_t biased_exp = (bits16 >> 10) & 0x1F; + const uint32_t mantissa = bits16 & 0x3FF; + + // Subnormal or zero + if (biased_exp == 0) { + const float subnormal = + (1.0f / 16384) * (static_cast(mantissa) * (1.0f / 1024)); + ret.raw[i] = sign ? -subnormal : subnormal; + continue; + } + + // Normalized: convert the representation directly (faster than + // ldexp/tables). + const uint32_t biased_exp32 = biased_exp + (127 - 15); + const uint32_t mantissa32 = mantissa << (23 - 10); + const uint32_t bits32 = (sign << 31) | (biased_exp32 << 23) | mantissa32; + CopySameSize(&bits32, &ret.raw[i]); + } + return ret; +} + +template +HWY_API Vec128 PromoteTo(Simd /* tag */, + const Vec128 v) { + Vec128 ret; + for (size_t i = 0; i < N; ++i) { + ret.raw[i] = F32FromBF16(v.raw[i]); + } + return ret; +} + +template +HWY_API Vec128 DemoteTo(Simd /* tag */, + const Vec128 v) { + Vec128 ret; + for (size_t i = 0; i < N; ++i) { + uint32_t bits32; + CopySameSize(&v.raw[i], &bits32); + const uint32_t sign = bits32 >> 31; + const uint32_t biased_exp32 = (bits32 >> 23) & 0xFF; + const uint32_t mantissa32 = bits32 & 0x7FFFFF; + + const int32_t exp = HWY_MIN(static_cast(biased_exp32) - 127, 15); + + // Tiny or zero => zero. 
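+// (The smallest positive f16 subnormal is 2^-24; inputs with a smaller
+// exponent cannot be represented, so they are flushed to zero.)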
+ if (exp < -24) { + ZeroBytes(&ret.raw[i]); + continue; + } + + uint32_t biased_exp16, mantissa16; + + // exp = [-24, -15] => subnormal + if (exp < -14) { + biased_exp16 = 0; + const uint32_t sub_exp = static_cast(-14 - exp); + HWY_DASSERT(1 <= sub_exp && sub_exp < 11); + mantissa16 = static_cast((1u << (10 - sub_exp)) + + (mantissa32 >> (13 + sub_exp))); + } else { + // exp = [-14, 15] + biased_exp16 = static_cast(exp + 15); + HWY_DASSERT(1 <= biased_exp16 && biased_exp16 < 31); + mantissa16 = mantissa32 >> 13; + } + + HWY_DASSERT(mantissa16 < 1024); + const uint32_t bits16 = (sign << 15) | (biased_exp16 << 10) | mantissa16; + HWY_DASSERT(bits16 < 0x10000); + const uint16_t narrowed = static_cast(bits16); // big-endian safe + detail::StoreU16ToF16(narrowed, &ret.raw[i]); + } + return ret; +} + +template +HWY_API Vec128 DemoteTo(Simd /* tag */, + const Vec128 v) { + Vec128 ret; + for (size_t i = 0; i < N; ++i) { + ret.raw[i] = BF16FromF32(v.raw[i]); + } + return ret; +} + +// Tag dispatch instead of SFINAE for MSVC 2017 compatibility +namespace detail { + +template +HWY_API Vec128 ConvertTo(hwy::FloatTag /*tag*/, + Simd /* tag */, + Vec128 from) { + static_assert(sizeof(ToT) == sizeof(FromT), "Should have same size"); + Vec128 ret; + for (size_t i = 0; i < N; ++i) { + // float## -> int##: return closest representable value. We cannot exactly + // represent LimitsMax in FromT, so use double. + const double f = static_cast(from.raw[i]); + if (std::isinf(from.raw[i]) || + std::fabs(f) > static_cast(LimitsMax())) { + ret.raw[i] = + std::signbit(from.raw[i]) ? LimitsMin() : LimitsMax(); + continue; + } + ret.raw[i] = static_cast(from.raw[i]); + } + return ret; +} + +template +HWY_API Vec128 ConvertTo(hwy::NonFloatTag /*tag*/, + Simd /* tag */, + Vec128 from) { + static_assert(sizeof(ToT) == sizeof(FromT), "Should have same size"); + Vec128 ret; + for (size_t i = 0; i < N; ++i) { + // int## -> float##: no check needed + ret.raw[i] = static_cast(from.raw[i]); + } + return ret; +} + +} // namespace detail + +template +HWY_API Vec128 ConvertTo(Simd d, Vec128 from) { + return detail::ConvertTo(hwy::IsFloatTag(), d, from); +} + +template +HWY_API Vec128 U8FromU32(const Vec128 v) { + return DemoteTo(Simd(), v); +} + +// ------------------------------ Truncations + +template +HWY_API Vec128 TruncateTo(Simd /* tag */, + const Vec128 v) { + Vec128 ret; + for (size_t i = 0; i < N; ++i) { + ret.raw[i] = static_cast(v.raw[i] & 0xFF); + } + return ret; +} + +template +HWY_API Vec128 TruncateTo(Simd /* tag */, + const Vec128 v) { + Vec128 ret; + for (size_t i = 0; i < N; ++i) { + ret.raw[i] = static_cast(v.raw[i] & 0xFFFF); + } + return ret; +} + +template +HWY_API Vec128 TruncateTo(Simd /* tag */, + const Vec128 v) { + Vec128 ret; + for (size_t i = 0; i < N; ++i) { + ret.raw[i] = static_cast(v.raw[i] & 0xFFFFFFFFu); + } + return ret; +} + +template +HWY_API Vec128 TruncateTo(Simd /* tag */, + const Vec128 v) { + Vec128 ret; + for (size_t i = 0; i < N; ++i) { + ret.raw[i] = static_cast(v.raw[i] & 0xFF); + } + return ret; +} + +template +HWY_API Vec128 TruncateTo(Simd /* tag */, + const Vec128 v) { + Vec128 ret; + for (size_t i = 0; i < N; ++i) { + ret.raw[i] = static_cast(v.raw[i] & 0xFFFF); + } + return ret; +} + +template +HWY_API Vec128 TruncateTo(Simd /* tag */, + const Vec128 v) { + Vec128 ret; + for (size_t i = 0; i < N; ++i) { + ret.raw[i] = static_cast(v.raw[i] & 0xFF); + } + return ret; +} + +// ================================================== COMBINE + +template +HWY_API Vec128 LowerHalf(Vec128 
v) { + Vec128 ret; + CopyBytes(v.raw, ret.raw); + return ret; +} + +template +HWY_API Vec128 LowerHalf(Simd /* tag */, + Vec128 v) { + return LowerHalf(v); +} + +template +HWY_API Vec128 UpperHalf(Simd /* tag */, + Vec128 v) { + Vec128 ret; + CopyBytes(&v.raw[N / 2], ret.raw); + return ret; +} + +template +HWY_API Vec128 ZeroExtendVector(Simd /* tag */, + Vec128 v) { + Vec128 ret; + CopyBytes(v.raw, ret.raw); + return ret; +} + +template +HWY_API Vec128 Combine(Simd /* tag */, Vec128 hi_half, + Vec128 lo_half) { + Vec128 ret; + CopyBytes(lo_half.raw, &ret.raw[0]); + CopyBytes(hi_half.raw, &ret.raw[N / 2]); + return ret; +} + +template +HWY_API Vec128 ConcatLowerLower(Simd /* tag */, Vec128 hi, + Vec128 lo) { + Vec128 ret; + CopyBytes(lo.raw, &ret.raw[0]); + CopyBytes(hi.raw, &ret.raw[N / 2]); + return ret; +} + +template +HWY_API Vec128 ConcatUpperUpper(Simd /* tag */, Vec128 hi, + Vec128 lo) { + Vec128 ret; + CopyBytes(&lo.raw[N / 2], &ret.raw[0]); + CopyBytes(&hi.raw[N / 2], &ret.raw[N / 2]); + return ret; +} + +template +HWY_API Vec128 ConcatLowerUpper(Simd /* tag */, + const Vec128 hi, + const Vec128 lo) { + Vec128 ret; + CopyBytes(&lo.raw[N / 2], &ret.raw[0]); + CopyBytes(hi.raw, &ret.raw[N / 2]); + return ret; +} + +template +HWY_API Vec128 ConcatUpperLower(Simd /* tag */, Vec128 hi, + Vec128 lo) { + Vec128 ret; + CopyBytes(lo.raw, &ret.raw[0]); + CopyBytes(&hi.raw[N / 2], &ret.raw[N / 2]); + return ret; +} + +template +HWY_API Vec128 ConcatEven(Simd /* tag */, Vec128 hi, + Vec128 lo) { + Vec128 ret; + for (size_t i = 0; i < N / 2; ++i) { + ret.raw[i] = lo.raw[2 * i]; + } + for (size_t i = 0; i < N / 2; ++i) { + ret.raw[N / 2 + i] = hi.raw[2 * i]; + } + return ret; +} + +template +HWY_API Vec128 ConcatOdd(Simd /* tag */, Vec128 hi, + Vec128 lo) { + Vec128 ret; + for (size_t i = 0; i < N / 2; ++i) { + ret.raw[i] = lo.raw[2 * i + 1]; + } + for (size_t i = 0; i < N / 2; ++i) { + ret.raw[N / 2 + i] = hi.raw[2 * i + 1]; + } + return ret; +} + +// ------------------------------ CombineShiftRightBytes + +template > +HWY_API V CombineShiftRightBytes(Simd /* tag */, V hi, V lo) { + V ret; + const uint8_t* HWY_RESTRICT lo8 = + reinterpret_cast(lo.raw); + uint8_t* HWY_RESTRICT ret8 = + reinterpret_cast(ret.raw); + CopyBytes(lo8 + kBytes, ret8); + CopyBytes(hi.raw, ret8 + sizeof(T) * N - kBytes); + return ret; +} + +// ------------------------------ ShiftLeftBytes + +template +HWY_API Vec128 ShiftLeftBytes(Simd /* tag */, Vec128 v) { + static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes"); + Vec128 ret; + uint8_t* HWY_RESTRICT ret8 = + reinterpret_cast(ret.raw); + ZeroBytes(ret8); + CopyBytes(v.raw, ret8 + kBytes); + return ret; +} + +template +HWY_API Vec128 ShiftLeftBytes(const Vec128 v) { + return ShiftLeftBytes(DFromV(), v); +} + +// ------------------------------ ShiftLeftLanes + +template +HWY_API Vec128 ShiftLeftLanes(Simd d, const Vec128 v) { + const Repartition d8; + return BitCast(d, ShiftLeftBytes(BitCast(d8, v))); +} + +template +HWY_API Vec128 ShiftLeftLanes(const Vec128 v) { + return ShiftLeftLanes(DFromV(), v); +} + +// ------------------------------ ShiftRightBytes +template +HWY_API Vec128 ShiftRightBytes(Simd /* tag */, Vec128 v) { + static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes"); + Vec128 ret; + const uint8_t* HWY_RESTRICT v8 = + reinterpret_cast(v.raw); + uint8_t* HWY_RESTRICT ret8 = + reinterpret_cast(ret.raw); + CopyBytes(v8 + kBytes, ret8); + ZeroBytes(ret8 + sizeof(T) * N - kBytes); + return ret; +} + +// ------------------------------ ShiftRightLanes 
+template +HWY_API Vec128 ShiftRightLanes(Simd d, const Vec128 v) { + const Repartition d8; + return BitCast(d, ShiftRightBytes(d8, BitCast(d8, v))); +} + +// ================================================== SWIZZLE + +template +HWY_API T GetLane(const Vec128 v) { + return v.raw[0]; +} + +template +HWY_API Vec128 InsertLane(Vec128 v, size_t i, T t) { + v.raw[i] = t; + return v; +} + +template +HWY_API T ExtractLane(const Vec128 v, size_t i) { + return v.raw[i]; +} + +template +HWY_API Vec128 DupEven(Vec128 v) { + for (size_t i = 0; i < N; i += 2) { + v.raw[i + 1] = v.raw[i]; + } + return v; +} + +template +HWY_API Vec128 DupOdd(Vec128 v) { + for (size_t i = 0; i < N; i += 2) { + v.raw[i] = v.raw[i + 1]; + } + return v; +} + +template +HWY_API Vec128 OddEven(Vec128 odd, Vec128 even) { + for (size_t i = 0; i < N; i += 2) { + odd.raw[i] = even.raw[i]; + } + return odd; +} + +template +HWY_API Vec128 OddEvenBlocks(Vec128 /* odd */, Vec128 even) { + return even; +} + +// ------------------------------ SwapAdjacentBlocks + +template +HWY_API Vec128 SwapAdjacentBlocks(Vec128 v) { + return v; +} + +// ------------------------------ TableLookupLanes + +// Returned by SetTableIndices for use by TableLookupLanes. +template +struct Indices128 { + MakeSigned raw[N]; +}; + +template +HWY_API Indices128 IndicesFromVec(Simd, Vec128 vec) { + static_assert(sizeof(T) == sizeof(TI), "Index size must match lane size"); + Indices128 ret; + CopyBytes(vec.raw, ret.raw); + return ret; +} + +template +HWY_API Indices128 SetTableIndices(Simd d, const TI* idx) { + return IndicesFromVec(d, LoadU(Simd(), idx)); +} + +template +HWY_API Vec128 TableLookupLanes(const Vec128 v, + const Indices128 idx) { + Vec128 ret; + for (size_t i = 0; i < N; ++i) { + ret.raw[i] = v.raw[idx.raw[i]]; + } + return ret; +} + +// ------------------------------ ReverseBlocks + +// Single block: no change +template +HWY_API Vec128 ReverseBlocks(Simd /* tag */, + const Vec128 v) { + return v; +} + +// ------------------------------ Reverse + +template +HWY_API Vec128 Reverse(Simd /* tag */, const Vec128 v) { + Vec128 ret; + for (size_t i = 0; i < N; ++i) { + ret.raw[i] = v.raw[N - 1 - i]; + } + return ret; +} + +template +HWY_API Vec128 Reverse2(Simd /* tag */, const Vec128 v) { + Vec128 ret; + for (size_t i = 0; i < N; i += 2) { + ret.raw[i + 0] = v.raw[i + 1]; + ret.raw[i + 1] = v.raw[i + 0]; + } + return ret; +} + +template +HWY_API Vec128 Reverse4(Simd /* tag */, const Vec128 v) { + Vec128 ret; + for (size_t i = 0; i < N; i += 4) { + ret.raw[i + 0] = v.raw[i + 3]; + ret.raw[i + 1] = v.raw[i + 2]; + ret.raw[i + 2] = v.raw[i + 1]; + ret.raw[i + 3] = v.raw[i + 0]; + } + return ret; +} + +template +HWY_API Vec128 Reverse8(Simd /* tag */, const Vec128 v) { + Vec128 ret; + for (size_t i = 0; i < N; i += 8) { + ret.raw[i + 0] = v.raw[i + 7]; + ret.raw[i + 1] = v.raw[i + 6]; + ret.raw[i + 2] = v.raw[i + 5]; + ret.raw[i + 3] = v.raw[i + 4]; + ret.raw[i + 4] = v.raw[i + 3]; + ret.raw[i + 5] = v.raw[i + 2]; + ret.raw[i + 6] = v.raw[i + 1]; + ret.raw[i + 7] = v.raw[i + 0]; + } + return ret; +} + +// ================================================== BLOCKWISE + +// ------------------------------ Shuffle* + +// Swap 32-bit halves in 64-bit halves. 
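+// For example, 32-bit lanes {0, 1, 2, 3} become {1, 0, 3, 2}.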
+template +HWY_API Vec128 Shuffle2301(const Vec128 v) { + static_assert(sizeof(T) == 4, "Only for 32-bit"); + static_assert(N == 2 || N == 4, "Does not make sense for N=1"); + return Reverse2(DFromV(), v); +} + +// Swap 64-bit halves +template +HWY_API Vec128 Shuffle1032(const Vec128 v) { + static_assert(sizeof(T) == 4, "Only for 32-bit"); + Vec128 ret; + ret.raw[3] = v.raw[1]; + ret.raw[2] = v.raw[0]; + ret.raw[1] = v.raw[3]; + ret.raw[0] = v.raw[2]; + return ret; +} +template +HWY_API Vec128 Shuffle01(const Vec128 v) { + static_assert(sizeof(T) == 8, "Only for 64-bit"); + return Reverse2(DFromV(), v); +} + +// Rotate right 32 bits +template +HWY_API Vec128 Shuffle0321(const Vec128 v) { + Vec128 ret; + ret.raw[3] = v.raw[0]; + ret.raw[2] = v.raw[3]; + ret.raw[1] = v.raw[2]; + ret.raw[0] = v.raw[1]; + return ret; +} + +// Rotate left 32 bits +template +HWY_API Vec128 Shuffle2103(const Vec128 v) { + Vec128 ret; + ret.raw[3] = v.raw[2]; + ret.raw[2] = v.raw[1]; + ret.raw[1] = v.raw[0]; + ret.raw[0] = v.raw[3]; + return ret; +} + +template +HWY_API Vec128 Shuffle0123(const Vec128 v) { + return Reverse4(DFromV(), v); +} + +// ------------------------------ Broadcast/splat any lane + +template +HWY_API Vec128 Broadcast(Vec128 v) { + for (size_t i = 0; i < N; ++i) { + v.raw[i] = v.raw[kLane]; + } + return v; +} + +// ------------------------------ TableLookupBytes, TableLookupBytesOr0 + +template +HWY_API Vec128 TableLookupBytes(const Vec128 v, + const Vec128 indices) { + const uint8_t* HWY_RESTRICT v_bytes = + reinterpret_cast(v.raw); + const uint8_t* HWY_RESTRICT idx_bytes = + reinterpret_cast(indices.raw); + Vec128 ret; + uint8_t* HWY_RESTRICT ret_bytes = + reinterpret_cast(ret.raw); + for (size_t i = 0; i < NI * sizeof(TI); ++i) { + const size_t idx = idx_bytes[i]; + // Avoid out of bounds reads. + ret_bytes[i] = idx < sizeof(T) * N ? v_bytes[idx] : 0; + } + return ret; +} + +template +HWY_API Vec128 TableLookupBytesOr0(const Vec128 v, + const Vec128 indices) { + // Same as TableLookupBytes, which already returns 0 if out of bounds. + return TableLookupBytes(v, indices); +} + +// ------------------------------ InterleaveLower/InterleaveUpper + +template +HWY_API Vec128 InterleaveLower(const Vec128 a, + const Vec128 b) { + Vec128 ret; + for (size_t i = 0; i < N / 2; ++i) { + ret.raw[2 * i + 0] = a.raw[i]; + ret.raw[2 * i + 1] = b.raw[i]; + } + return ret; +} + +// Additional overload for the optional tag (also for 256/512). +template +HWY_API V InterleaveLower(DFromV /* tag */, V a, V b) { + return InterleaveLower(a, b); +} + +template +HWY_API Vec128 InterleaveUpper(Simd /* tag */, + const Vec128 a, + const Vec128 b) { + Vec128 ret; + for (size_t i = 0; i < N / 2; ++i) { + ret.raw[2 * i + 0] = a.raw[N / 2 + i]; + ret.raw[2 * i + 1] = b.raw[N / 2 + i]; + } + return ret; +} + +// ------------------------------ ZipLower/ZipUpper (InterleaveLower) + +// Same as Interleave*, except that the return lanes are double-width integers; +// this is necessary because the single-lane scalar cannot return two values. 
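+// Usage sketch (illustrative; `a` and `b` are hypothetical u8 vectors of the
+// same length). On little-endian hosts, each u16 lane i is (b[i] << 8) | a[i]:
+//   const Full64<uint8_t> d8;
+//   const RepartitionToWide<decltype(d8)> d16;
+//   const auto zipped = ZipLower(d16, a, b);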
+template >> +HWY_API VFromD ZipLower(V a, V b) { + return BitCast(DW(), InterleaveLower(a, b)); +} +template , class DW = RepartitionToWide> +HWY_API VFromD ZipLower(DW dw, V a, V b) { + return BitCast(dw, InterleaveLower(D(), a, b)); +} + +template , class DW = RepartitionToWide> +HWY_API VFromD ZipUpper(DW dw, V a, V b) { + return BitCast(dw, InterleaveUpper(D(), a, b)); +} + +// ================================================== MASK + +template +HWY_API bool AllFalse(Simd /* tag */, const Mask128 mask) { + typename Mask128::Raw or_sum = 0; + for (size_t i = 0; i < N; ++i) { + or_sum |= mask.bits[i]; + } + return or_sum == 0; +} + +template +HWY_API bool AllTrue(Simd /* tag */, const Mask128 mask) { + constexpr uint64_t kAll = LimitsMax::Raw>(); + uint64_t and_sum = kAll; + for (size_t i = 0; i < N; ++i) { + and_sum &= mask.bits[i]; + } + return and_sum == kAll; +} + +// `p` points to at least 8 readable bytes, not all of which need be valid. +template +HWY_API Mask128 LoadMaskBits(Simd /* tag */, + const uint8_t* HWY_RESTRICT bits) { + Mask128 m; + for (size_t i = 0; i < N; ++i) { + const size_t bit = size_t{1} << (i & 7); + const size_t idx_byte = i >> 3; + m.bits[i] = Mask128::FromBool((bits[idx_byte] & bit) != 0); + } + return m; +} + +// `p` points to at least 8 writable bytes. +template +HWY_API size_t StoreMaskBits(Simd /* tag */, const Mask128 mask, + uint8_t* bits) { + bits[0] = 0; + if (N > 8) bits[1] = 0; // N <= 16, so max two bytes + for (size_t i = 0; i < N; ++i) { + const size_t bit = size_t{1} << (i & 7); + const size_t idx_byte = i >> 3; + if (mask.bits[i]) { + bits[idx_byte] = static_cast(bits[idx_byte] | bit); + } + } + return N > 8 ? 2 : 1; +} + +template +HWY_API size_t CountTrue(Simd /* tag */, const Mask128 mask) { + size_t count = 0; + for (size_t i = 0; i < N; ++i) { + count += mask.bits[i] != 0; + } + return count; +} + +template +HWY_API size_t FindKnownFirstTrue(Simd /* tag */, + const Mask128 mask) { + for (size_t i = 0; i < N; ++i) { + if (mask.bits[i] != 0) return i; + } + HWY_DASSERT(false); + return 0; +} + +template +HWY_API intptr_t FindFirstTrue(Simd /* tag */, + const Mask128 mask) { + for (size_t i = 0; i < N; ++i) { + if (mask.bits[i] != 0) return static_cast(i); + } + return intptr_t{-1}; +} + +// ------------------------------ Compress + +template +struct CompressIsPartition { + enum { value = 1 }; +}; + +template +HWY_API Vec128 Compress(Vec128 v, const Mask128 mask) { + size_t count = 0; + Vec128 ret; + for (size_t i = 0; i < N; ++i) { + if (mask.bits[i]) { + ret.raw[count++] = v.raw[i]; + } + } + for (size_t i = 0; i < N; ++i) { + if (!mask.bits[i]) { + ret.raw[count++] = v.raw[i]; + } + } + HWY_DASSERT(count == N); + return ret; +} + +// ------------------------------ CompressNot +template +HWY_API Vec128 CompressNot(Vec128 v, const Mask128 mask) { + size_t count = 0; + Vec128 ret; + for (size_t i = 0; i < N; ++i) { + if (!mask.bits[i]) { + ret.raw[count++] = v.raw[i]; + } + } + for (size_t i = 0; i < N; ++i) { + if (mask.bits[i]) { + ret.raw[count++] = v.raw[i]; + } + } + HWY_DASSERT(count == N); + return ret; +} + +// ------------------------------ CompressBlocksNot +HWY_API Vec128 CompressBlocksNot(Vec128 v, + Mask128 /* m */) { + return v; +} + +// ------------------------------ CompressBits +template +HWY_API Vec128 CompressBits(Vec128 v, + const uint8_t* HWY_RESTRICT bits) { + return Compress(v, LoadMaskBits(Simd(), bits)); +} + +// ------------------------------ CompressStore +template +HWY_API size_t CompressStore(Vec128 v, const 
Mask128 mask, + Simd /* tag */, + T* HWY_RESTRICT unaligned) { + size_t count = 0; + for (size_t i = 0; i < N; ++i) { + if (mask.bits[i]) { + unaligned[count++] = v.raw[i]; + } + } + return count; +} + +// ------------------------------ CompressBlendedStore +template +HWY_API size_t CompressBlendedStore(Vec128 v, const Mask128 mask, + Simd d, + T* HWY_RESTRICT unaligned) { + return CompressStore(v, mask, d, unaligned); +} + +// ------------------------------ CompressBitsStore +template +HWY_API size_t CompressBitsStore(Vec128 v, + const uint8_t* HWY_RESTRICT bits, + Simd d, T* HWY_RESTRICT unaligned) { + const Mask128 mask = LoadMaskBits(d, bits); + StoreU(Compress(v, mask), d, unaligned); + return CountTrue(d, mask); +} + +// ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower) + +template +HWY_API Vec128 ReorderWidenMulAccumulate(Simd df32, + Vec128 a, + Vec128 b, + const Vec128 sum0, + Vec128& sum1) { + const Rebind dbf16; + // Avoid ZipLower/Upper so this also works on big-endian systems. + const Vec128 a0 = PromoteTo(df32, LowerHalf(dbf16, a)); + const Vec128 a1 = PromoteTo(df32, UpperHalf(dbf16, a)); + const Vec128 b0 = PromoteTo(df32, LowerHalf(dbf16, b)); + const Vec128 b1 = PromoteTo(df32, UpperHalf(dbf16, b)); + sum1 = MulAdd(BitCast(df32, a1), BitCast(df32, b1), sum1); + return MulAdd(BitCast(df32, a0), BitCast(df32, b0), sum0); +} + +template +HWY_API Vec128 ReorderWidenMulAccumulate( + Simd d32, Vec128 a, Vec128 b, + const Vec128 sum0, Vec128& sum1) { + const Rebind d16; + // Avoid ZipLower/Upper so this also works on big-endian systems. + const Vec128 a0 = PromoteTo(d32, LowerHalf(d16, a)); + const Vec128 a1 = PromoteTo(d32, UpperHalf(d16, a)); + const Vec128 b0 = PromoteTo(d32, LowerHalf(d16, b)); + const Vec128 b1 = PromoteTo(d32, UpperHalf(d16, b)); + sum1 = MulAdd(BitCast(d32, a1), BitCast(d32, b1), sum1); + return MulAdd(BitCast(d32, a0), BitCast(d32, b0), sum0); +} + +// ================================================== REDUCTIONS + +template +HWY_API Vec128 SumOfLanes(Simd d, const Vec128 v) { + T sum = T{0}; + for (size_t i = 0; i < N; ++i) { + sum += v.raw[i]; + } + return Set(d, sum); +} +template +HWY_API Vec128 MinOfLanes(Simd d, const Vec128 v) { + T min = HighestValue(); + for (size_t i = 0; i < N; ++i) { + min = HWY_MIN(min, v.raw[i]); + } + return Set(d, min); +} +template +HWY_API Vec128 MaxOfLanes(Simd d, const Vec128 v) { + T max = LowestValue(); + for (size_t i = 0; i < N; ++i) { + max = HWY_MAX(max, v.raw[i]); + } + return Set(d, max); +} + +// ================================================== OPS WITH DEPENDENCIES + +// ------------------------------ MulEven/Odd 64x64 (UpperHalf) + +HWY_INLINE Vec128 MulEven(const Vec128 a, + const Vec128 b) { + alignas(16) uint64_t mul[2]; + mul[0] = Mul128(GetLane(a), GetLane(b), &mul[1]); + return Load(Full128(), mul); +} + +HWY_INLINE Vec128 MulOdd(const Vec128 a, + const Vec128 b) { + alignas(16) uint64_t mul[2]; + const Half> d2; + mul[0] = + Mul128(GetLane(UpperHalf(d2, a)), GetLane(UpperHalf(d2, b)), &mul[1]); + return Load(Full128(), mul); +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); diff --git a/hwy/ops/generic_ops-inl.h b/hwy/ops/generic_ops-inl.h new file mode 100644 index 0000000..b01c5de --- /dev/null +++ b/hwy/ops/generic_ops-inl.h @@ -0,0 +1,1357 @@ +// Copyright 2021 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); 
+// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Target-independent types/functions defined after target-specific ops. + +// Relies on the external include guard in highway.h. +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { + +// The lane type of a vector type, e.g. float for Vec>. +template +using LaneType = decltype(GetLane(V())); + +// Vector type, e.g. Vec128 for CappedTag. Useful as the return +// type of functions that do not take a vector argument, or as an argument type +// if the function only has a template argument for D, or for explicit type +// names instead of auto. This may be a built-in type. +template +using Vec = decltype(Zero(D())); + +// Mask type. Useful as the return type of functions that do not take a mask +// argument, or as an argument type if the function only has a template argument +// for D, or for explicit type names instead of auto. +template +using Mask = decltype(MaskFromVec(Zero(D()))); + +// Returns the closest value to v within [lo, hi]. +template +HWY_API V Clamp(const V v, const V lo, const V hi) { + return Min(Max(lo, v), hi); +} + +// CombineShiftRightBytes (and -Lanes) are not available for the scalar target, +// and RVV has its own implementation of -Lanes. +#if HWY_TARGET != HWY_SCALAR && HWY_TARGET != HWY_RVV + +template > +HWY_API V CombineShiftRightLanes(D d, const V hi, const V lo) { + constexpr size_t kBytes = kLanes * sizeof(LaneType); + static_assert(kBytes < 16, "Shift count is per-block"); + return CombineShiftRightBytes(d, hi, lo); +} + +#endif + +// Returns lanes with the most significant bit set and all other bits zero. +template +HWY_API Vec SignBit(D d) { + const RebindToUnsigned du; + return BitCast(d, Set(du, SignMask>())); +} + +// Returns quiet NaN. +template +HWY_API Vec NaN(D d) { + const RebindToSigned di; + // LimitsMax sets all exponent and mantissa bits to 1. The exponent plus + // mantissa MSB (to indicate quiet) would be sufficient. + return BitCast(d, Set(di, LimitsMax>())); +} + +// Returns positive infinity. +template +HWY_API Vec Inf(D d) { + const RebindToUnsigned du; + using T = TFromD; + using TU = TFromD; + const TU max_x2 = static_cast(MaxExponentTimes2()); + return BitCast(d, Set(du, max_x2 >> 1)); +} + +// ------------------------------ SafeFillN + +template > +HWY_API void SafeFillN(const size_t num, const T value, D d, + T* HWY_RESTRICT to) { +#if HWY_MEM_OPS_MIGHT_FAULT + (void)d; + for (size_t i = 0; i < num; ++i) { + to[i] = value; + } +#else + BlendedStore(Set(d, value), FirstN(d, num), d, to); +#endif +} + +// ------------------------------ SafeCopyN + +template > +HWY_API void SafeCopyN(const size_t num, D d, const T* HWY_RESTRICT from, + T* HWY_RESTRICT to) { +#if HWY_MEM_OPS_MIGHT_FAULT + (void)d; + for (size_t i = 0; i < num; ++i) { + to[i] = from[i]; + } +#else + const Mask mask = FirstN(d, num); + BlendedStore(MaskedLoad(mask, d, from), mask, d, to); +#endif +} + +// "Include guard": skip if native instructions are available. 
The generic +// implementation is currently shared between x86_* and wasm_*, and is too large +// to duplicate. + +#if (defined(HWY_NATIVE_LOAD_STORE_INTERLEAVED) == defined(HWY_TARGET_TOGGLE)) +#ifdef HWY_NATIVE_LOAD_STORE_INTERLEAVED +#undef HWY_NATIVE_LOAD_STORE_INTERLEAVED +#else +#define HWY_NATIVE_LOAD_STORE_INTERLEAVED +#endif + +// ------------------------------ LoadInterleaved2 + +template +HWY_API void LoadInterleaved2(Simd d, const T* HWY_RESTRICT unaligned, + V& v0, V& v1) { + const V A = LoadU(d, unaligned + 0 * N); // v1[1] v0[1] v1[0] v0[0] + const V B = LoadU(d, unaligned + 1 * N); + v0 = ConcatEven(d, B, A); + v1 = ConcatOdd(d, B, A); +} + +template +HWY_API void LoadInterleaved2(Simd d, const T* HWY_RESTRICT unaligned, + V& v0, V& v1) { + v0 = LoadU(d, unaligned + 0); + v1 = LoadU(d, unaligned + 1); +} + +// ------------------------------ LoadInterleaved3 (CombineShiftRightBytes) + +namespace detail { + +// Default for <= 128-bit vectors; x86_256 and x86_512 have their own overload. +template +HWY_API void LoadTransposedBlocks3(Simd d, + const T* HWY_RESTRICT unaligned, V& A, V& B, + V& C) { + A = LoadU(d, unaligned + 0 * N); + B = LoadU(d, unaligned + 1 * N); + C = LoadU(d, unaligned + 2 * N); +} + +} // namespace detail + +template +HWY_API void LoadInterleaved3(Simd d, const T* HWY_RESTRICT unaligned, + V& v0, V& v1, V& v2) { + const RebindToUnsigned du; + // Compact notation so these fit on one line: 12 := v1[2]. + V A; // 05 24 14 04 23 13 03 22 12 02 21 11 01 20 10 00 + V B; // 1a 0a 29 19 09 28 18 08 27 17 07 26 16 06 25 15 + V C; // 2f 1f 0f 2e 1e 0e 2d 1d 0d 2c 1c 0c 2b 1b 0b 2a + detail::LoadTransposedBlocks3(d, unaligned, A, B, C); + // Compress all lanes belonging to v0 into consecutive lanes. + constexpr uint8_t Z = 0x80; + alignas(16) constexpr uint8_t kIdx_v0A[16] = {0, 3, 6, 9, 12, 15, Z, Z, + Z, Z, Z, Z, Z, Z, Z, Z}; + alignas(16) constexpr uint8_t kIdx_v0B[16] = {Z, Z, Z, Z, Z, Z, 2, 5, + 8, 11, 14, Z, Z, Z, Z, Z}; + alignas(16) constexpr uint8_t kIdx_v0C[16] = {Z, Z, Z, Z, Z, Z, Z, Z, + Z, Z, Z, 1, 4, 7, 10, 13}; + alignas(16) constexpr uint8_t kIdx_v1A[16] = {1, 4, 7, 10, 13, Z, Z, Z, + Z, Z, Z, Z, Z, Z, Z, Z}; + alignas(16) constexpr uint8_t kIdx_v1B[16] = {Z, Z, Z, Z, Z, 0, 3, 6, + 9, 12, 15, Z, Z, Z, Z, Z}; + alignas(16) constexpr uint8_t kIdx_v1C[16] = {Z, Z, Z, Z, Z, Z, Z, Z, + Z, Z, Z, 2, 5, 8, 11, 14}; + alignas(16) constexpr uint8_t kIdx_v2A[16] = {2, 5, 8, 11, 14, Z, Z, Z, + Z, Z, Z, Z, Z, Z, Z, Z}; + alignas(16) constexpr uint8_t kIdx_v2B[16] = {Z, Z, Z, Z, Z, 1, 4, 7, + 10, 13, Z, Z, Z, Z, Z, Z}; + alignas(16) constexpr uint8_t kIdx_v2C[16] = {Z, Z, Z, Z, Z, Z, Z, Z, + Z, Z, 0, 3, 6, 9, 12, 15}; + const V v0L = BitCast(d, TableLookupBytesOr0(A, LoadDup128(du, kIdx_v0A))); + const V v0M = BitCast(d, TableLookupBytesOr0(B, LoadDup128(du, kIdx_v0B))); + const V v0U = BitCast(d, TableLookupBytesOr0(C, LoadDup128(du, kIdx_v0C))); + const V v1L = BitCast(d, TableLookupBytesOr0(A, LoadDup128(du, kIdx_v1A))); + const V v1M = BitCast(d, TableLookupBytesOr0(B, LoadDup128(du, kIdx_v1B))); + const V v1U = BitCast(d, TableLookupBytesOr0(C, LoadDup128(du, kIdx_v1C))); + const V v2L = BitCast(d, TableLookupBytesOr0(A, LoadDup128(du, kIdx_v2A))); + const V v2M = BitCast(d, TableLookupBytesOr0(B, LoadDup128(du, kIdx_v2B))); + const V v2U = BitCast(d, TableLookupBytesOr0(C, LoadDup128(du, kIdx_v2C))); + v0 = Or3(v0L, v0M, v0U); + v1 = Or3(v1L, v1M, v1U); + v2 = Or3(v2L, v2M, v2U); +} + +// 8-bit lanes x8 +template +HWY_API void LoadInterleaved3(Simd d, const 
T* HWY_RESTRICT unaligned, + V& v0, V& v1, V& v2) { + const RebindToUnsigned du; + V A; // v1[2] v0[2] v2[1] v1[1] v0[1] v2[0] v1[0] v0[0] + V B; // v0[5] v2[4] v1[4] v0[4] v2[3] v1[3] v0[3] v2[2] + V C; // v2[7] v1[7] v0[7] v2[6] v1[6] v0[6] v2[5] v1[5] + detail::LoadTransposedBlocks3(d, unaligned, A, B, C); + // Compress all lanes belonging to v0 into consecutive lanes. + constexpr uint8_t Z = 0x80; + alignas(16) constexpr uint8_t kIdx_v0A[16] = {0, 3, 6, Z, Z, Z, Z, Z}; + alignas(16) constexpr uint8_t kIdx_v0B[16] = {Z, Z, Z, 1, 4, 7, Z, Z}; + alignas(16) constexpr uint8_t kIdx_v0C[16] = {Z, Z, Z, Z, Z, Z, 2, 5}; + alignas(16) constexpr uint8_t kIdx_v1A[16] = {1, 4, 7, Z, Z, Z, Z, Z}; + alignas(16) constexpr uint8_t kIdx_v1B[16] = {Z, Z, Z, 2, 5, Z, Z, Z}; + alignas(16) constexpr uint8_t kIdx_v1C[16] = {Z, Z, Z, Z, Z, 0, 3, 6}; + alignas(16) constexpr uint8_t kIdx_v2A[16] = {2, 5, Z, Z, Z, Z, Z, Z}; + alignas(16) constexpr uint8_t kIdx_v2B[16] = {Z, Z, 0, 3, 6, Z, Z, Z}; + alignas(16) constexpr uint8_t kIdx_v2C[16] = {Z, Z, Z, Z, Z, 1, 4, 7}; + const V v0L = BitCast(d, TableLookupBytesOr0(A, LoadDup128(du, kIdx_v0A))); + const V v0M = BitCast(d, TableLookupBytesOr0(B, LoadDup128(du, kIdx_v0B))); + const V v0U = BitCast(d, TableLookupBytesOr0(C, LoadDup128(du, kIdx_v0C))); + const V v1L = BitCast(d, TableLookupBytesOr0(A, LoadDup128(du, kIdx_v1A))); + const V v1M = BitCast(d, TableLookupBytesOr0(B, LoadDup128(du, kIdx_v1B))); + const V v1U = BitCast(d, TableLookupBytesOr0(C, LoadDup128(du, kIdx_v1C))); + const V v2L = BitCast(d, TableLookupBytesOr0(A, LoadDup128(du, kIdx_v2A))); + const V v2M = BitCast(d, TableLookupBytesOr0(B, LoadDup128(du, kIdx_v2B))); + const V v2U = BitCast(d, TableLookupBytesOr0(C, LoadDup128(du, kIdx_v2C))); + v0 = Or3(v0L, v0M, v0U); + v1 = Or3(v1L, v1M, v1U); + v2 = Or3(v2L, v2M, v2U); +} + +// 16-bit lanes x8 +template +HWY_API void LoadInterleaved3(Simd d, const T* HWY_RESTRICT unaligned, + V& v0, V& v1, V& v2) { + const RebindToUnsigned du; + V A; // v1[2] v0[2] v2[1] v1[1] v0[1] v2[0] v1[0] v0[0] + V B; // v0[5] v2[4] v1[4] v0[4] v2[3] v1[3] v0[3] v2[2] + V C; // v2[7] v1[7] v0[7] v2[6] v1[6] v0[6] v2[5] v1[5] + detail::LoadTransposedBlocks3(d, unaligned, A, B, C); + // Compress all lanes belonging to v0 into consecutive lanes. Same as above, + // but each element of the array contains two byte indices for a lane. 
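+// (On little-endian hosts, e.g. the pair 0x0100 supplies byte indices 0 and
+// 1, i.e. 16-bit lane 0 of the source; 0x8080 zeros both bytes.)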
+ constexpr uint16_t Z = 0x8080; + alignas(16) constexpr uint16_t kIdx_v0A[8] = {0x0100, 0x0706, 0x0D0C, Z, + Z, Z, Z, Z}; + alignas(16) constexpr uint16_t kIdx_v0B[8] = {Z, Z, Z, 0x0302, + 0x0908, 0x0F0E, Z, Z}; + alignas(16) constexpr uint16_t kIdx_v0C[8] = {Z, Z, Z, Z, + Z, Z, 0x0504, 0x0B0A}; + alignas(16) constexpr uint16_t kIdx_v1A[8] = {0x0302, 0x0908, 0x0F0E, Z, + Z, Z, Z, Z}; + alignas(16) constexpr uint16_t kIdx_v1B[8] = {Z, Z, Z, 0x0504, + 0x0B0A, Z, Z, Z}; + alignas(16) constexpr uint16_t kIdx_v1C[8] = {Z, Z, Z, Z, + Z, 0x0100, 0x0706, 0x0D0C}; + alignas(16) constexpr uint16_t kIdx_v2A[8] = {0x0504, 0x0B0A, Z, Z, + Z, Z, Z, Z}; + alignas(16) constexpr uint16_t kIdx_v2B[8] = {Z, Z, 0x0100, 0x0706, + 0x0D0C, Z, Z, Z}; + alignas(16) constexpr uint16_t kIdx_v2C[8] = {Z, Z, Z, Z, + Z, 0x0302, 0x0908, 0x0F0E}; + const V v0L = BitCast(d, TableLookupBytesOr0(A, LoadDup128(du, kIdx_v0A))); + const V v0M = BitCast(d, TableLookupBytesOr0(B, LoadDup128(du, kIdx_v0B))); + const V v0U = BitCast(d, TableLookupBytesOr0(C, LoadDup128(du, kIdx_v0C))); + const V v1L = BitCast(d, TableLookupBytesOr0(A, LoadDup128(du, kIdx_v1A))); + const V v1M = BitCast(d, TableLookupBytesOr0(B, LoadDup128(du, kIdx_v1B))); + const V v1U = BitCast(d, TableLookupBytesOr0(C, LoadDup128(du, kIdx_v1C))); + const V v2L = BitCast(d, TableLookupBytesOr0(A, LoadDup128(du, kIdx_v2A))); + const V v2M = BitCast(d, TableLookupBytesOr0(B, LoadDup128(du, kIdx_v2B))); + const V v2U = BitCast(d, TableLookupBytesOr0(C, LoadDup128(du, kIdx_v2C))); + v0 = Or3(v0L, v0M, v0U); + v1 = Or3(v1L, v1M, v1U); + v2 = Or3(v2L, v2M, v2U); +} + +template +HWY_API void LoadInterleaved3(Simd d, const T* HWY_RESTRICT unaligned, + V& v0, V& v1, V& v2) { + V A; // v0[1] v2[0] v1[0] v0[0] + V B; // v1[2] v0[2] v2[1] v1[1] + V C; // v2[3] v1[3] v0[3] v2[2] + detail::LoadTransposedBlocks3(d, unaligned, A, B, C); + + const V vxx_02_03_xx = OddEven(C, B); + v0 = detail::Shuffle1230(A, vxx_02_03_xx); + + // Shuffle2301 takes the upper/lower halves of the output from one input, so + // we cannot just combine 13 and 10 with 12 and 11 (similar to v0/v2). Use + // OddEven because it may have higher throughput than Shuffle. + const V vxx_xx_10_11 = OddEven(A, B); + const V v12_13_xx_xx = OddEven(B, C); + v1 = detail::Shuffle2301(vxx_xx_10_11, v12_13_xx_xx); + + const V vxx_20_21_xx = OddEven(B, A); + v2 = detail::Shuffle3012(vxx_20_21_xx, C); +} + +template +HWY_API void LoadInterleaved3(Simd d, const T* HWY_RESTRICT unaligned, + V& v0, V& v1, V& v2) { + V A; // v1[0] v0[0] + V B; // v0[1] v2[0] + V C; // v2[1] v1[1] + detail::LoadTransposedBlocks3(d, unaligned, A, B, C); + v0 = OddEven(B, A); + v1 = CombineShiftRightBytes(d, C, A); + v2 = OddEven(C, B); +} + +template +HWY_API void LoadInterleaved3(Simd d, const T* HWY_RESTRICT unaligned, + V& v0, V& v1, V& v2) { + v0 = LoadU(d, unaligned + 0); + v1 = LoadU(d, unaligned + 1); + v2 = LoadU(d, unaligned + 2); +} + +// ------------------------------ LoadInterleaved4 + +namespace detail { + +// Default for <= 128-bit vectors; x86_256 and x86_512 have their own overload. 
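+// Loads four consecutive vectors without any reordering; the LoadInterleaved4
+// overloads below then interleave their lanes into v0..v3.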
+template +HWY_API void LoadTransposedBlocks4(Simd d, + const T* HWY_RESTRICT unaligned, V& A, V& B, + V& C, V& D) { + A = LoadU(d, unaligned + 0 * N); + B = LoadU(d, unaligned + 1 * N); + C = LoadU(d, unaligned + 2 * N); + D = LoadU(d, unaligned + 3 * N); +} + +} // namespace detail + +template +HWY_API void LoadInterleaved4(Simd d, const T* HWY_RESTRICT unaligned, + V& v0, V& v1, V& v2, V& v3) { + const Repartition d64; + using V64 = VFromD; + // 16 lanes per block; the lowest four blocks are at the bottom of A,B,C,D. + // Here int[i] means the four interleaved values of the i-th 4-tuple and + // int[3..0] indicates four consecutive 4-tuples (0 = least-significant). + V A; // int[13..10] int[3..0] + V B; // int[17..14] int[7..4] + V C; // int[1b..18] int[b..8] + V D; // int[1f..1c] int[f..c] + detail::LoadTransposedBlocks4(d, unaligned, A, B, C, D); + + // For brevity, the comments only list the lower block (upper = lower + 0x10) + const V v5140 = InterleaveLower(d, A, B); // int[5,1,4,0] + const V vd9c8 = InterleaveLower(d, C, D); // int[d,9,c,8] + const V v7362 = InterleaveUpper(d, A, B); // int[7,3,6,2] + const V vfbea = InterleaveUpper(d, C, D); // int[f,b,e,a] + + const V v6420 = InterleaveLower(d, v5140, v7362); // int[6,4,2,0] + const V veca8 = InterleaveLower(d, vd9c8, vfbea); // int[e,c,a,8] + const V v7531 = InterleaveUpper(d, v5140, v7362); // int[7,5,3,1] + const V vfdb9 = InterleaveUpper(d, vd9c8, vfbea); // int[f,d,b,9] + + const V64 v10L = BitCast(d64, InterleaveLower(d, v6420, v7531)); // v10[7..0] + const V64 v10U = BitCast(d64, InterleaveLower(d, veca8, vfdb9)); // v10[f..8] + const V64 v32L = BitCast(d64, InterleaveUpper(d, v6420, v7531)); // v32[7..0] + const V64 v32U = BitCast(d64, InterleaveUpper(d, veca8, vfdb9)); // v32[f..8] + + v0 = BitCast(d, InterleaveLower(d64, v10L, v10U)); + v1 = BitCast(d, InterleaveUpper(d64, v10L, v10U)); + v2 = BitCast(d, InterleaveLower(d64, v32L, v32U)); + v3 = BitCast(d, InterleaveUpper(d64, v32L, v32U)); +} + +template +HWY_API void LoadInterleaved4(Simd d, const T* HWY_RESTRICT unaligned, + V& v0, V& v1, V& v2, V& v3) { + // In the last step, we interleave by half of the block size, which is usually + // 8 bytes but half that for 8-bit x8 vectors. + using TW = hwy::UnsignedFromSize; + const Repartition dw; + using VW = VFromD; + + // (Comments are for 256-bit vectors.) + // 8 lanes per block; the lowest four blocks are at the bottom of A,B,C,D. 
+ V A; // v3210[9]v3210[8] v3210[1]v3210[0] + V B; // v3210[b]v3210[a] v3210[3]v3210[2] + V C; // v3210[d]v3210[c] v3210[5]v3210[4] + V D; // v3210[f]v3210[e] v3210[7]v3210[6] + detail::LoadTransposedBlocks4(d, unaligned, A, B, C, D); + + const V va820 = InterleaveLower(d, A, B); // v3210[a,8] v3210[2,0] + const V vec64 = InterleaveLower(d, C, D); // v3210[e,c] v3210[6,4] + const V vb931 = InterleaveUpper(d, A, B); // v3210[b,9] v3210[3,1] + const V vfd75 = InterleaveUpper(d, C, D); // v3210[f,d] v3210[7,5] + + const VW v10_b830 = // v10[b..8] v10[3..0] + BitCast(dw, InterleaveLower(d, va820, vb931)); + const VW v10_fc74 = // v10[f..c] v10[7..4] + BitCast(dw, InterleaveLower(d, vec64, vfd75)); + const VW v32_b830 = // v32[b..8] v32[3..0] + BitCast(dw, InterleaveUpper(d, va820, vb931)); + const VW v32_fc74 = // v32[f..c] v32[7..4] + BitCast(dw, InterleaveUpper(d, vec64, vfd75)); + + v0 = BitCast(d, InterleaveLower(dw, v10_b830, v10_fc74)); + v1 = BitCast(d, InterleaveUpper(dw, v10_b830, v10_fc74)); + v2 = BitCast(d, InterleaveLower(dw, v32_b830, v32_fc74)); + v3 = BitCast(d, InterleaveUpper(dw, v32_b830, v32_fc74)); +} + +template +HWY_API void LoadInterleaved4(Simd d, const T* HWY_RESTRICT unaligned, + V& v0, V& v1, V& v2, V& v3) { + V A; // v3210[4] v3210[0] + V B; // v3210[5] v3210[1] + V C; // v3210[6] v3210[2] + V D; // v3210[7] v3210[3] + detail::LoadTransposedBlocks4(d, unaligned, A, B, C, D); + const V v10_ev = InterleaveLower(d, A, C); // v1[6,4] v0[6,4] v1[2,0] v0[2,0] + const V v10_od = InterleaveLower(d, B, D); // v1[7,5] v0[7,5] v1[3,1] v0[3,1] + const V v32_ev = InterleaveUpper(d, A, C); // v3[6,4] v2[6,4] v3[2,0] v2[2,0] + const V v32_od = InterleaveUpper(d, B, D); // v3[7,5] v2[7,5] v3[3,1] v2[3,1] + + v0 = InterleaveLower(d, v10_ev, v10_od); + v1 = InterleaveUpper(d, v10_ev, v10_od); + v2 = InterleaveLower(d, v32_ev, v32_od); + v3 = InterleaveUpper(d, v32_ev, v32_od); +} + +template +HWY_API void LoadInterleaved4(Simd d, const T* HWY_RESTRICT unaligned, + V& v0, V& v1, V& v2, V& v3) { + V A, B, C, D; + detail::LoadTransposedBlocks4(d, unaligned, A, B, C, D); + v0 = InterleaveLower(d, A, C); + v1 = InterleaveUpper(d, A, C); + v2 = InterleaveLower(d, B, D); + v3 = InterleaveUpper(d, B, D); +} + +// Any T x1 +template +HWY_API void LoadInterleaved4(Simd d, const T* HWY_RESTRICT unaligned, + V& v0, V& v1, V& v2, V& v3) { + v0 = LoadU(d, unaligned + 0); + v1 = LoadU(d, unaligned + 1); + v2 = LoadU(d, unaligned + 2); + v3 = LoadU(d, unaligned + 3); +} + +// ------------------------------ StoreInterleaved2 + +namespace detail { + +// Default for <= 128-bit vectors; x86_256 and x86_512 have their own overload. +template +HWY_API void StoreTransposedBlocks2(const V A, const V B, Simd d, + T* HWY_RESTRICT unaligned) { + StoreU(A, d, unaligned + 0 * N); + StoreU(B, d, unaligned + 1 * N); +} + +} // namespace detail + +// >= 128 bit vector +template +HWY_API void StoreInterleaved2(const V v0, const V v1, Simd d, + T* HWY_RESTRICT unaligned) { + const auto v10L = InterleaveLower(d, v0, v1); // .. v1[0] v0[0] + const auto v10U = InterleaveUpper(d, v0, v1); // .. v1[N/2] v0[N/2] + detail::StoreTransposedBlocks2(v10L, v10U, d, unaligned); +} + +// 64 bits +template +HWY_API void StoreInterleaved2(const Vec64 part0, const Vec64 part1, + Full64 /*tag*/, T* HWY_RESTRICT unaligned) { + // Use full vectors to reduce the number of stores. 
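+// (The two 64-bit parts become the lower halves of full 128-bit vectors, so
+// one InterleaveLower yields all interleaved lanes for a single StoreU.)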
+ const Full128 d_full; + const Vec128 v0{part0.raw}; + const Vec128 v1{part1.raw}; + const auto v10 = InterleaveLower(d_full, v0, v1); + StoreU(v10, d_full, unaligned); +} + +// <= 32 bits +template +HWY_API void StoreInterleaved2(const Vec128 part0, + const Vec128 part1, Simd /*tag*/, + T* HWY_RESTRICT unaligned) { + // Use full vectors to reduce the number of stores. + const Full128 d_full; + const Vec128 v0{part0.raw}; + const Vec128 v1{part1.raw}; + const auto v10 = InterleaveLower(d_full, v0, v1); + alignas(16) T buf[16 / sizeof(T)]; + StoreU(v10, d_full, buf); + CopyBytes<2 * N * sizeof(T)>(buf, unaligned); +} + +// ------------------------------ StoreInterleaved3 (CombineShiftRightBytes, +// TableLookupBytes) + +namespace detail { + +// Default for <= 128-bit vectors; x86_256 and x86_512 have their own overload. +template +HWY_API void StoreTransposedBlocks3(const V A, const V B, const V C, + Simd d, + T* HWY_RESTRICT unaligned) { + StoreU(A, d, unaligned + 0 * N); + StoreU(B, d, unaligned + 1 * N); + StoreU(C, d, unaligned + 2 * N); +} + +} // namespace detail + +// >= 128-bit vector, 8-bit lanes +template +HWY_API void StoreInterleaved3(const V v0, const V v1, const V v2, + Simd d, T* HWY_RESTRICT unaligned) { + const RebindToUnsigned du; + const auto k5 = Set(du, 5); + const auto k6 = Set(du, 6); + + // Interleave (v0,v1,v2) to (MSB on left, lane 0 on right): + // v0[5], v2[4],v1[4],v0[4] .. v2[0],v1[0],v0[0]. We're expanding v0 lanes + // to their place, with 0x80 so lanes to be filled from other vectors are 0 + // to enable blending by ORing together. + alignas(16) static constexpr uint8_t tbl_v0[16] = { + 0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80, // + 3, 0x80, 0x80, 4, 0x80, 0x80, 5}; + alignas(16) static constexpr uint8_t tbl_v1[16] = { + 0x80, 0, 0x80, 0x80, 1, 0x80, // + 0x80, 2, 0x80, 0x80, 3, 0x80, 0x80, 4, 0x80, 0x80}; + // The interleaved vectors will be named A, B, C; temporaries with suffix + // 0..2 indicate which input vector's lanes they hold. + const auto shuf_A0 = LoadDup128(du, tbl_v0); + const auto shuf_A1 = LoadDup128(du, tbl_v1); // cannot reuse shuf_A0 (has 5) + const auto shuf_A2 = CombineShiftRightBytes<15>(du, shuf_A1, shuf_A1); + const auto A0 = TableLookupBytesOr0(v0, shuf_A0); // 5..4..3..2..1..0 + const auto A1 = TableLookupBytesOr0(v1, shuf_A1); // ..4..3..2..1..0. + const auto A2 = TableLookupBytesOr0(v2, shuf_A2); // .4..3..2..1..0.. + const V A = BitCast(d, A0 | A1 | A2); + + // B: v1[10],v0[10], v2[9],v1[9],v0[9] .. , v2[6],v1[6],v0[6], v2[5],v1[5] + const auto shuf_B0 = shuf_A2 + k6; // .A..9..8..7..6.. + const auto shuf_B1 = shuf_A0 + k5; // A..9..8..7..6..5 + const auto shuf_B2 = shuf_A1 + k5; // ..9..8..7..6..5. + const auto B0 = TableLookupBytesOr0(v0, shuf_B0); + const auto B1 = TableLookupBytesOr0(v1, shuf_B1); + const auto B2 = TableLookupBytesOr0(v2, shuf_B2); + const V B = BitCast(d, B0 | B1 | B2); + + // C: v2[15],v1[15],v0[15], v2[11],v1[11],v0[11], v2[10] + const auto shuf_C0 = shuf_B2 + k6; // ..F..E..D..C..B. + const auto shuf_C1 = shuf_B0 + k5; // .F..E..D..C..B.. 
+ const auto shuf_C2 = shuf_B1 + k5; // F..E..D..C..B..A + const auto C0 = TableLookupBytesOr0(v0, shuf_C0); + const auto C1 = TableLookupBytesOr0(v1, shuf_C1); + const auto C2 = TableLookupBytesOr0(v2, shuf_C2); + const V C = BitCast(d, C0 | C1 | C2); + + detail::StoreTransposedBlocks3(A, B, C, d, unaligned); +} + +// >= 128-bit vector, 16-bit lanes +template +HWY_API void StoreInterleaved3(const V v0, const V v1, const V v2, + Simd d, T* HWY_RESTRICT unaligned) { + const Repartition du8; + const auto k2 = Set(du8, 2 * sizeof(T)); + const auto k3 = Set(du8, 3 * sizeof(T)); + + // Interleave (v0,v1,v2) to (MSB on left, lane 0 on right): + // v1[2],v0[2], v2[1],v1[1],v0[1], v2[0],v1[0],v0[0]. 0x80 so lanes to be + // filled from other vectors are 0 for blending. Note that these are byte + // indices for 16-bit lanes. + alignas(16) static constexpr uint8_t tbl_v1[16] = { + 0x80, 0x80, 0, 1, 0x80, 0x80, 0x80, 0x80, + 2, 3, 0x80, 0x80, 0x80, 0x80, 4, 5}; + alignas(16) static constexpr uint8_t tbl_v2[16] = { + 0x80, 0x80, 0x80, 0x80, 0, 1, 0x80, 0x80, + 0x80, 0x80, 2, 3, 0x80, 0x80, 0x80, 0x80}; + + // The interleaved vectors will be named A, B, C; temporaries with suffix + // 0..2 indicate which input vector's lanes they hold. + const auto shuf_A1 = LoadDup128(du8, tbl_v1); // 2..1..0. + // .2..1..0 + const auto shuf_A0 = CombineShiftRightBytes<2>(du8, shuf_A1, shuf_A1); + const auto shuf_A2 = LoadDup128(du8, tbl_v2); // ..1..0.. + + const auto A0 = TableLookupBytesOr0(v0, shuf_A0); + const auto A1 = TableLookupBytesOr0(v1, shuf_A1); + const auto A2 = TableLookupBytesOr0(v2, shuf_A2); + const V A = BitCast(d, A0 | A1 | A2); + + // B: v0[5] v2[4],v1[4],v0[4], v2[3],v1[3],v0[3], v2[2] + const auto shuf_B0 = shuf_A1 + k3; // 5..4..3. + const auto shuf_B1 = shuf_A2 + k3; // ..4..3.. + const auto shuf_B2 = shuf_A0 + k2; // .4..3..2 + const auto B0 = TableLookupBytesOr0(v0, shuf_B0); + const auto B1 = TableLookupBytesOr0(v1, shuf_B1); + const auto B2 = TableLookupBytesOr0(v2, shuf_B2); + const V B = BitCast(d, B0 | B1 | B2); + + // C: v2[7],v1[7],v0[7], v2[6],v1[6],v0[6], v2[5],v1[5] + const auto shuf_C0 = shuf_B1 + k3; // ..7..6.. + const auto shuf_C1 = shuf_B2 + k3; // .7..6..5 + const auto shuf_C2 = shuf_B0 + k2; // 7..6..5. + const auto C0 = TableLookupBytesOr0(v0, shuf_C0); + const auto C1 = TableLookupBytesOr0(v1, shuf_C1); + const auto C2 = TableLookupBytesOr0(v2, shuf_C2); + const V C = BitCast(d, C0 | C1 | C2); + + detail::StoreTransposedBlocks3(A, B, C, d, unaligned); +} + +// >= 128-bit vector, 32-bit lanes +template +HWY_API void StoreInterleaved3(const V v0, const V v1, const V v2, + Simd d, T* HWY_RESTRICT unaligned) { + const RepartitionToWide dw; + + const V v10_v00 = InterleaveLower(d, v0, v1); + const V v01_v20 = OddEven(v0, v2); + // A: v0[1], v2[0],v1[0],v0[0] (<- lane 0) + const V A = BitCast( + d, InterleaveLower(dw, BitCast(dw, v10_v00), BitCast(dw, v01_v20))); + + const V v1_321 = ShiftRightLanes<1>(d, v1); + const V v0_32 = ShiftRightLanes<2>(d, v0); + const V v21_v11 = OddEven(v2, v1_321); + const V v12_v02 = OddEven(v1_321, v0_32); + // B: v1[2],v0[2], v2[1],v1[1] + const V B = BitCast( + d, InterleaveLower(dw, BitCast(dw, v21_v11), BitCast(dw, v12_v02))); + + // Notation refers to the upper 2 lanes of the vector for InterleaveUpper. 
+ const V v23_v13 = OddEven(v2, v1_321); + const V v03_v22 = OddEven(v0, v2); + // C: v2[3],v1[3],v0[3], v2[2] + const V C = BitCast( + d, InterleaveUpper(dw, BitCast(dw, v03_v22), BitCast(dw, v23_v13))); + + detail::StoreTransposedBlocks3(A, B, C, d, unaligned); +} + +// >= 128-bit vector, 64-bit lanes +template +HWY_API void StoreInterleaved3(const V v0, const V v1, const V v2, + Simd d, T* HWY_RESTRICT unaligned) { + const V A = InterleaveLower(d, v0, v1); + const V B = OddEven(v0, v2); + const V C = InterleaveUpper(d, v1, v2); + detail::StoreTransposedBlocks3(A, B, C, d, unaligned); +} + +// 64-bit vector, 8-bit lanes +template +HWY_API void StoreInterleaved3(const Vec64 part0, const Vec64 part1, + const Vec64 part2, Full64 d, + T* HWY_RESTRICT unaligned) { + constexpr size_t N = 16 / sizeof(T); + // Use full vectors for the shuffles and first result. + const Full128 du; + const Full128 d_full; + const auto k5 = Set(du, 5); + const auto k6 = Set(du, 6); + + const Vec128 v0{part0.raw}; + const Vec128 v1{part1.raw}; + const Vec128 v2{part2.raw}; + + // Interleave (v0,v1,v2) to (MSB on left, lane 0 on right): + // v1[2],v0[2], v2[1],v1[1],v0[1], v2[0],v1[0],v0[0]. 0x80 so lanes to be + // filled from other vectors are 0 for blending. + alignas(16) static constexpr uint8_t tbl_v0[16] = { + 0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80, // + 3, 0x80, 0x80, 4, 0x80, 0x80, 5}; + alignas(16) static constexpr uint8_t tbl_v1[16] = { + 0x80, 0, 0x80, 0x80, 1, 0x80, // + 0x80, 2, 0x80, 0x80, 3, 0x80, 0x80, 4, 0x80, 0x80}; + // The interleaved vectors will be named A, B, C; temporaries with suffix + // 0..2 indicate which input vector's lanes they hold. + const auto shuf_A0 = Load(du, tbl_v0); + const auto shuf_A1 = Load(du, tbl_v1); // cannot reuse shuf_A0 (5 in MSB) + const auto shuf_A2 = CombineShiftRightBytes<15>(du, shuf_A1, shuf_A1); + const auto A0 = TableLookupBytesOr0(v0, shuf_A0); // 5..4..3..2..1..0 + const auto A1 = TableLookupBytesOr0(v1, shuf_A1); // ..4..3..2..1..0. + const auto A2 = TableLookupBytesOr0(v2, shuf_A2); // .4..3..2..1..0.. + const auto A = BitCast(d_full, A0 | A1 | A2); + StoreU(A, d_full, unaligned + 0 * N); + + // Second (HALF) vector: v2[7],v1[7],v0[7], v2[6],v1[6],v0[6], v2[5],v1[5] + const auto shuf_B0 = shuf_A2 + k6; // ..7..6.. + const auto shuf_B1 = shuf_A0 + k5; // .7..6..5 + const auto shuf_B2 = shuf_A1 + k5; // 7..6..5. + const auto B0 = TableLookupBytesOr0(v0, shuf_B0); + const auto B1 = TableLookupBytesOr0(v1, shuf_B1); + const auto B2 = TableLookupBytesOr0(v2, shuf_B2); + const Vec64 B{(B0 | B1 | B2).raw}; + StoreU(B, d, unaligned + 1 * N); +} + +// 64-bit vector, 16-bit lanes +template +HWY_API void StoreInterleaved3(const Vec64 part0, const Vec64 part1, + const Vec64 part2, Full64 dh, + T* HWY_RESTRICT unaligned) { + const Full128 d; + const Full128 du8; + constexpr size_t N = 16 / sizeof(T); + const auto k2 = Set(du8, 2 * sizeof(T)); + const auto k3 = Set(du8, 3 * sizeof(T)); + + const Vec128 v0{part0.raw}; + const Vec128 v1{part1.raw}; + const Vec128 v2{part2.raw}; + + // Interleave part (v0,v1,v2) to full (MSB on left, lane 0 on right): + // v1[2],v0[2], v2[1],v1[1],v0[1], v2[0],v1[0],v0[0]. We're expanding v0 lanes + // to their place, with 0x80 so lanes to be filled from other vectors are 0 + // to enable blending by ORing together. 
+ alignas(16) static constexpr uint8_t tbl_v1[16] = { + 0x80, 0x80, 0, 1, 0x80, 0x80, 0x80, 0x80, + 2, 3, 0x80, 0x80, 0x80, 0x80, 4, 5}; + alignas(16) static constexpr uint8_t tbl_v2[16] = { + 0x80, 0x80, 0x80, 0x80, 0, 1, 0x80, 0x80, + 0x80, 0x80, 2, 3, 0x80, 0x80, 0x80, 0x80}; + + // The interleaved vectors will be named A, B; temporaries with suffix + // 0..2 indicate which input vector's lanes they hold. + const auto shuf_A1 = Load(du8, tbl_v1); // 2..1..0. + // .2..1..0 + const auto shuf_A0 = CombineShiftRightBytes<2>(du8, shuf_A1, shuf_A1); + const auto shuf_A2 = Load(du8, tbl_v2); // ..1..0.. + + const auto A0 = TableLookupBytesOr0(v0, shuf_A0); + const auto A1 = TableLookupBytesOr0(v1, shuf_A1); + const auto A2 = TableLookupBytesOr0(v2, shuf_A2); + const Vec128 A = BitCast(d, A0 | A1 | A2); + StoreU(A, d, unaligned + 0 * N); + + // Second (HALF) vector: v2[3],v1[3],v0[3], v2[2] + const auto shuf_B0 = shuf_A1 + k3; // ..3. + const auto shuf_B1 = shuf_A2 + k3; // .3.. + const auto shuf_B2 = shuf_A0 + k2; // 3..2 + const auto B0 = TableLookupBytesOr0(v0, shuf_B0); + const auto B1 = TableLookupBytesOr0(v1, shuf_B1); + const auto B2 = TableLookupBytesOr0(v2, shuf_B2); + const Vec128 B = BitCast(d, B0 | B1 | B2); + StoreU(Vec64{B.raw}, dh, unaligned + 1 * N); +} + +// 64-bit vector, 32-bit lanes +template +HWY_API void StoreInterleaved3(const Vec64 v0, const Vec64 v1, + const Vec64 v2, Full64 d, + T* HWY_RESTRICT unaligned) { + // (same code as 128-bit vector, 64-bit lanes) + constexpr size_t N = 2; + const Vec64 v10_v00 = InterleaveLower(d, v0, v1); + const Vec64 v01_v20 = OddEven(v0, v2); + const Vec64 v21_v11 = InterleaveUpper(d, v1, v2); + StoreU(v10_v00, d, unaligned + 0 * N); + StoreU(v01_v20, d, unaligned + 1 * N); + StoreU(v21_v11, d, unaligned + 2 * N); +} + +// 64-bit lanes are handled by the N=1 case below. + +// <= 32-bit vector, 8-bit lanes +template +HWY_API void StoreInterleaved3(const Vec128 part0, + const Vec128 part1, + const Vec128 part2, Simd /*tag*/, + T* HWY_RESTRICT unaligned) { + // Use full vectors for the shuffles and result. + const Full128 du; + const Full128 d_full; + + const Vec128 v0{part0.raw}; + const Vec128 v1{part1.raw}; + const Vec128 v2{part2.raw}; + + // Interleave (v0,v1,v2). We're expanding v0 lanes to their place, with 0x80 + // so lanes to be filled from other vectors are 0 to enable blending by ORing + // together. + alignas(16) static constexpr uint8_t tbl_v0[16] = { + 0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, + 0x80, 3, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}; + // The interleaved vector will be named A; temporaries with suffix + // 0..2 indicate which input vector's lanes they hold. + const auto shuf_A0 = Load(du, tbl_v0); + const auto shuf_A1 = CombineShiftRightBytes<15>(du, shuf_A0, shuf_A0); + const auto shuf_A2 = CombineShiftRightBytes<14>(du, shuf_A0, shuf_A0); + const auto A0 = TableLookupBytesOr0(v0, shuf_A0); // ......3..2..1..0 + const auto A1 = TableLookupBytesOr0(v1, shuf_A1); // .....3..2..1..0. + const auto A2 = TableLookupBytesOr0(v2, shuf_A2); // ....3..2..1..0.. + const Vec128 A = BitCast(d_full, A0 | A1 | A2); + alignas(16) T buf[16 / sizeof(T)]; + StoreU(A, d_full, buf); + CopyBytes(buf, unaligned); +} + +// 32-bit vector, 16-bit lanes +template +HWY_API void StoreInterleaved3(const Vec128 part0, + const Vec128 part1, + const Vec128 part2, Simd /*tag*/, + T* HWY_RESTRICT unaligned) { + constexpr size_t N = 4 / sizeof(T); + // Use full vectors for the shuffles and result. 
+ const Full128 du8; + const Full128 d_full; + + const Vec128 v0{part0.raw}; + const Vec128 v1{part1.raw}; + const Vec128 v2{part2.raw}; + + // Interleave (v0,v1,v2). We're expanding v0 lanes to their place, with 0x80 + // so lanes to be filled from other vectors are 0 to enable blending by ORing + // together. + alignas(16) static constexpr uint8_t tbl_v2[16] = { + 0x80, 0x80, 0x80, 0x80, 0, 1, 0x80, 0x80, + 0x80, 0x80, 2, 3, 0x80, 0x80, 0x80, 0x80}; + // The interleaved vector will be named A; temporaries with suffix + // 0..2 indicate which input vector's lanes they hold. + const auto shuf_A2 = // ..1..0.. + Load(du8, tbl_v2); + const auto shuf_A1 = // ...1..0. + CombineShiftRightBytes<2>(du8, shuf_A2, shuf_A2); + const auto shuf_A0 = // ....1..0 + CombineShiftRightBytes<4>(du8, shuf_A2, shuf_A2); + const auto A0 = TableLookupBytesOr0(v0, shuf_A0); // ..1..0 + const auto A1 = TableLookupBytesOr0(v1, shuf_A1); // .1..0. + const auto A2 = TableLookupBytesOr0(v2, shuf_A2); // 1..0.. + const auto A = BitCast(d_full, A0 | A1 | A2); + alignas(16) T buf[16 / sizeof(T)]; + StoreU(A, d_full, buf); + CopyBytes(buf, unaligned); +} + +// Single-element vector, any lane size: just store directly +template +HWY_API void StoreInterleaved3(const Vec128 v0, const Vec128 v1, + const Vec128 v2, Simd d, + T* HWY_RESTRICT unaligned) { + StoreU(v0, d, unaligned + 0); + StoreU(v1, d, unaligned + 1); + StoreU(v2, d, unaligned + 2); +} + +// ------------------------------ StoreInterleaved4 + +namespace detail { + +// Default for <= 128-bit vectors; x86_256 and x86_512 have their own overload. +template +HWY_API void StoreTransposedBlocks4(const V A, const V B, const V C, const V D, + Simd d, + T* HWY_RESTRICT unaligned) { + StoreU(A, d, unaligned + 0 * N); + StoreU(B, d, unaligned + 1 * N); + StoreU(C, d, unaligned + 2 * N); + StoreU(D, d, unaligned + 3 * N); +} + +} // namespace detail + +// >= 128-bit vector, 8..32-bit lanes +template +HWY_API void StoreInterleaved4(const V v0, const V v1, const V v2, const V v3, + Simd d, T* HWY_RESTRICT unaligned) { + const RepartitionToWide dw; + const auto v10L = ZipLower(dw, v0, v1); // .. v1[0] v0[0] + const auto v32L = ZipLower(dw, v2, v3); + const auto v10U = ZipUpper(dw, v0, v1); + const auto v32U = ZipUpper(dw, v2, v3); + // The interleaved vectors are A, B, C, D. + const auto A = BitCast(d, InterleaveLower(dw, v10L, v32L)); // 3210 + const auto B = BitCast(d, InterleaveUpper(dw, v10L, v32L)); + const auto C = BitCast(d, InterleaveLower(dw, v10U, v32U)); + const auto D = BitCast(d, InterleaveUpper(dw, v10U, v32U)); + detail::StoreTransposedBlocks4(A, B, C, D, d, unaligned); +} + +// >= 128-bit vector, 64-bit lanes +template +HWY_API void StoreInterleaved4(const V v0, const V v1, const V v2, const V v3, + Simd d, T* HWY_RESTRICT unaligned) { + // The interleaved vectors are A, B, C, D. + const auto A = InterleaveLower(d, v0, v1); // v1[0] v0[0] + const auto B = InterleaveLower(d, v2, v3); + const auto C = InterleaveUpper(d, v0, v1); + const auto D = InterleaveUpper(d, v2, v3); + detail::StoreTransposedBlocks4(A, B, C, D, d, unaligned); +} + +// 64-bit vector, 8..32-bit lanes +template +HWY_API void StoreInterleaved4(const Vec64 part0, const Vec64 part1, + const Vec64 part2, const Vec64 part3, + Full64 /*tag*/, T* HWY_RESTRICT unaligned) { + constexpr size_t N = 16 / sizeof(T); + // Use full vectors to reduce the number of stores. 
+ const Full128 d_full; + const RepartitionToWide dw; + const Vec128 v0{part0.raw}; + const Vec128 v1{part1.raw}; + const Vec128 v2{part2.raw}; + const Vec128 v3{part3.raw}; + const auto v10 = ZipLower(dw, v0, v1); // v1[0] v0[0] + const auto v32 = ZipLower(dw, v2, v3); + const auto A = BitCast(d_full, InterleaveLower(dw, v10, v32)); + const auto B = BitCast(d_full, InterleaveUpper(dw, v10, v32)); + StoreU(A, d_full, unaligned + 0 * N); + StoreU(B, d_full, unaligned + 1 * N); +} + +// 64-bit vector, 64-bit lane +template +HWY_API void StoreInterleaved4(const Vec64 part0, const Vec64 part1, + const Vec64 part2, const Vec64 part3, + Full64 /*tag*/, T* HWY_RESTRICT unaligned) { + constexpr size_t N = 16 / sizeof(T); + // Use full vectors to reduce the number of stores. + const Full128 d_full; + const Vec128 v0{part0.raw}; + const Vec128 v1{part1.raw}; + const Vec128 v2{part2.raw}; + const Vec128 v3{part3.raw}; + const auto A = InterleaveLower(d_full, v0, v1); // v1[0] v0[0] + const auto B = InterleaveLower(d_full, v2, v3); + StoreU(A, d_full, unaligned + 0 * N); + StoreU(B, d_full, unaligned + 1 * N); +} + +// <= 32-bit vectors +template +HWY_API void StoreInterleaved4(const Vec128 part0, + const Vec128 part1, + const Vec128 part2, + const Vec128 part3, Simd /*tag*/, + T* HWY_RESTRICT unaligned) { + // Use full vectors to reduce the number of stores. + const Full128 d_full; + const RepartitionToWide dw; + const Vec128 v0{part0.raw}; + const Vec128 v1{part1.raw}; + const Vec128 v2{part2.raw}; + const Vec128 v3{part3.raw}; + const auto v10 = ZipLower(dw, v0, v1); // .. v1[0] v0[0] + const auto v32 = ZipLower(dw, v2, v3); + const auto v3210 = BitCast(d_full, InterleaveLower(dw, v10, v32)); + alignas(16) T buf[16 / sizeof(T)]; + StoreU(v3210, d_full, buf); + CopyBytes<4 * N * sizeof(T)>(buf, unaligned); +} + +#endif // HWY_NATIVE_LOAD_STORE_INTERLEAVED + +// ------------------------------ AESRound + +// Cannot implement on scalar: need at least 16 bytes for TableLookupBytes. +#if HWY_TARGET != HWY_SCALAR + +// Define for white-box testing, even if native instructions are available. +namespace detail { + +// Constant-time: computes inverse in GF(2^4) based on "Accelerating AES with +// Vector Permute Instructions" and the accompanying assembly language +// implementation: https://crypto.stanford.edu/vpaes/vpaes.tgz. See also Botan: +// https://botan.randombit.net/doxygen/aes__vperm_8cpp_source.html . +// +// A brute-force 256 byte table lookup can also be made constant-time, and +// possibly competitive on NEON, but this is more performance-portable +// especially for x86 and large vectors. +template // u8 +HWY_INLINE V SubBytes(V state) { + const DFromV du; + const auto mask = Set(du, 0xF); + + // Change polynomial basis to GF(2^4) + { + alignas(16) static constexpr uint8_t basisL[16] = { + 0x00, 0x70, 0x2A, 0x5A, 0x98, 0xE8, 0xB2, 0xC2, + 0x08, 0x78, 0x22, 0x52, 0x90, 0xE0, 0xBA, 0xCA}; + alignas(16) static constexpr uint8_t basisU[16] = { + 0x00, 0x4D, 0x7C, 0x31, 0x7D, 0x30, 0x01, 0x4C, + 0x81, 0xCC, 0xFD, 0xB0, 0xFC, 0xB1, 0x80, 0xCD}; + const auto sL = And(state, mask); + const auto sU = ShiftRight<4>(state); // byte shift => upper bits are zero + const auto gf4L = TableLookupBytes(LoadDup128(du, basisL), sL); + const auto gf4U = TableLookupBytes(LoadDup128(du, basisU), sU); + state = Xor(gf4L, gf4U); + } + + // Inversion in GF(2^4). Elements 0 represent "infinity" (division by 0) and + // cause TableLookupBytesOr0 to return 0. 
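+  // In the tables below, entry 0 is 0x80 rather than a field element: the
+  // MSB flags the non-invertible input 0, and TableLookupBytesOr0 returns 0
+  // for any index whose MSB is set, realizing "inverse of zero is zero".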
+ alignas(16) static constexpr uint8_t kZetaInv[16] = { + 0x80, 7, 11, 15, 6, 10, 4, 1, 9, 8, 5, 2, 12, 14, 13, 3}; + alignas(16) static constexpr uint8_t kInv[16] = { + 0x80, 1, 8, 13, 15, 6, 5, 14, 2, 12, 11, 10, 9, 3, 7, 4}; + const auto tbl = LoadDup128(du, kInv); + const auto sL = And(state, mask); // L=low nibble, U=upper + const auto sU = ShiftRight<4>(state); // byte shift => upper bits are zero + const auto sX = Xor(sU, sL); + const auto invL = TableLookupBytes(LoadDup128(du, kZetaInv), sL); + const auto invU = TableLookupBytes(tbl, sU); + const auto invX = TableLookupBytes(tbl, sX); + const auto outL = Xor(sX, TableLookupBytesOr0(tbl, Xor(invL, invU))); + const auto outU = Xor(sU, TableLookupBytesOr0(tbl, Xor(invL, invX))); + + // Linear skew (cannot bake 0x63 bias into the table because out* indices + // may have the infinity flag set). + alignas(16) static constexpr uint8_t kAffineL[16] = { + 0x00, 0xC7, 0xBD, 0x6F, 0x17, 0x6D, 0xD2, 0xD0, + 0x78, 0xA8, 0x02, 0xC5, 0x7A, 0xBF, 0xAA, 0x15}; + alignas(16) static constexpr uint8_t kAffineU[16] = { + 0x00, 0x6A, 0xBB, 0x5F, 0xA5, 0x74, 0xE4, 0xCF, + 0xFA, 0x35, 0x2B, 0x41, 0xD1, 0x90, 0x1E, 0x8E}; + const auto affL = TableLookupBytesOr0(LoadDup128(du, kAffineL), outL); + const auto affU = TableLookupBytesOr0(LoadDup128(du, kAffineU), outU); + return Xor(Xor(affL, affU), Set(du, 0x63)); +} + +} // namespace detail + +#endif // HWY_TARGET != HWY_SCALAR + +// "Include guard": skip if native AES instructions are available. +#if (defined(HWY_NATIVE_AES) == defined(HWY_TARGET_TOGGLE)) +#ifdef HWY_NATIVE_AES +#undef HWY_NATIVE_AES +#else +#define HWY_NATIVE_AES +#endif + +// (Must come after HWY_TARGET_TOGGLE, else we don't reset it for scalar) +#if HWY_TARGET != HWY_SCALAR + +namespace detail { + +template // u8 +HWY_API V ShiftRows(const V state) { + const DFromV du; + alignas(16) static constexpr uint8_t kShiftRow[16] = { + 0, 5, 10, 15, // transposed: state is column major + 4, 9, 14, 3, // + 8, 13, 2, 7, // + 12, 1, 6, 11}; + const auto shift_row = LoadDup128(du, kShiftRow); + return TableLookupBytes(state, shift_row); +} + +template // u8 +HWY_API V MixColumns(const V state) { + const DFromV du; + // For each column, the rows are the sum of GF(2^8) matrix multiplication by: + // 2 3 1 1 // Let s := state*1, d := state*2, t := state*3. + // 1 2 3 1 // d are on diagonal, no permutation needed. + // 1 1 2 3 // t1230 indicates column indices of threes for the 4 rows. + // 3 1 1 2 // We also need to compute s2301 and s3012 (=1230 o 2301). + alignas(16) static constexpr uint8_t k2301[16] = { + 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}; + alignas(16) static constexpr uint8_t k1230[16] = { + 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12}; + const RebindToSigned di; // can only do signed comparisons + const auto msb = Lt(BitCast(di, state), Zero(di)); + const auto overflow = BitCast(du, IfThenElseZero(msb, Set(di, 0x1B))); + const auto d = Xor(Add(state, state), overflow); // = state*2 in GF(2^8). 
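+  // Doubling in GF(2^8) is xtime: shift left, then XOR 0x1B where the MSB
+  // was set (reduction by the AES polynomial 0x11B). Worked example:
+  // 0x80*2 = 0x100, and 0x100 ^ 0x11B = 0x1B. In 8-bit lanes,
+  // Add(state, state) drops the 0x100 carry and the masked Xor supplies 0x1B.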
+  const auto s2301 = TableLookupBytes(state, LoadDup128(du, k2301));
+  const auto d_s2301 = Xor(d, s2301);
+  const auto t_s2301 = Xor(state, d_s2301);  // t(s*3) = XOR-sum {s, d(s*2)}
+  const auto t1230_s3012 = TableLookupBytes(t_s2301, LoadDup128(du, k1230));
+  return Xor(d_s2301, t1230_s3012);  // XOR-sum of 4 terms
+}
+
+}  // namespace detail
+
+template <class V>  // u8
+HWY_API V AESRound(V state, const V round_key) {
+  // Intel docs swap the first two steps, but it does not matter because
+  // ShiftRows is a permutation and SubBytes is independent of lane index.
+  state = detail::SubBytes(state);
+  state = detail::ShiftRows(state);
+  state = detail::MixColumns(state);
+  state = Xor(state, round_key);  // AddRoundKey
+  return state;
+}
+
+template <class V>  // u8
+HWY_API V AESLastRound(V state, const V round_key) {
+  // Like AESRound, but without MixColumns.
+  state = detail::SubBytes(state);
+  state = detail::ShiftRows(state);
+  state = Xor(state, round_key);  // AddRoundKey
+  return state;
+}
+
+// Constant-time implementation inspired by
+// https://www.bearssl.org/constanttime.html, but about half the cost because we
+// use 64x64 multiplies and 128-bit XORs.
+template <class V>
+HWY_API V CLMulLower(V a, V b) {
+  const DFromV<V> d;
+  static_assert(IsSame<TFromV<V>, uint64_t>(), "V must be u64");
+  const auto k1 = Set(d, 0x1111111111111111ULL);
+  const auto k2 = Set(d, 0x2222222222222222ULL);
+  const auto k4 = Set(d, 0x4444444444444444ULL);
+  const auto k8 = Set(d, 0x8888888888888888ULL);
+  const auto a0 = And(a, k1);
+  const auto a1 = And(a, k2);
+  const auto a2 = And(a, k4);
+  const auto a3 = And(a, k8);
+  const auto b0 = And(b, k1);
+  const auto b1 = And(b, k2);
+  const auto b2 = And(b, k4);
+  const auto b3 = And(b, k8);
+
+  auto m0 = Xor(MulEven(a0, b0), MulEven(a1, b3));
+  auto m1 = Xor(MulEven(a0, b1), MulEven(a1, b0));
+  auto m2 = Xor(MulEven(a0, b2), MulEven(a1, b1));
+  auto m3 = Xor(MulEven(a0, b3), MulEven(a1, b2));
+  m0 = Xor(m0, Xor(MulEven(a2, b2), MulEven(a3, b1)));
+  m1 = Xor(m1, Xor(MulEven(a2, b3), MulEven(a3, b2)));
+  m2 = Xor(m2, Xor(MulEven(a2, b0), MulEven(a3, b3)));
+  m3 = Xor(m3, Xor(MulEven(a2, b1), MulEven(a3, b0)));
+  return Or(Or(And(m0, k1), And(m1, k2)), Or(And(m2, k4), And(m3, k8)));
+}
+
+template <class V>
+HWY_API V CLMulUpper(V a, V b) {
+  const DFromV<V> d;
+  static_assert(IsSame<TFromV<V>, uint64_t>(), "V must be u64");
+  const auto k1 = Set(d, 0x1111111111111111ULL);
+  const auto k2 = Set(d, 0x2222222222222222ULL);
+  const auto k4 = Set(d, 0x4444444444444444ULL);
+  const auto k8 = Set(d, 0x8888888888888888ULL);
+  const auto a0 = And(a, k1);
+  const auto a1 = And(a, k2);
+  const auto a2 = And(a, k4);
+  const auto a3 = And(a, k8);
+  const auto b0 = And(b, k1);
+  const auto b1 = And(b, k2);
+  const auto b2 = And(b, k4);
+  const auto b3 = And(b, k8);
+
+  auto m0 = Xor(MulOdd(a0, b0), MulOdd(a1, b3));
+  auto m1 = Xor(MulOdd(a0, b1), MulOdd(a1, b0));
+  auto m2 = Xor(MulOdd(a0, b2), MulOdd(a1, b1));
+  auto m3 = Xor(MulOdd(a0, b3), MulOdd(a1, b2));
+  m0 = Xor(m0, Xor(MulOdd(a2, b2), MulOdd(a3, b1)));
+  m1 = Xor(m1, Xor(MulOdd(a2, b3), MulOdd(a3, b2)));
+  m2 = Xor(m2, Xor(MulOdd(a2, b0), MulOdd(a3, b3)));
+  m3 = Xor(m3, Xor(MulOdd(a2, b1), MulOdd(a3, b0)));
+  return Or(Or(And(m0, k1), And(m1, k2)), Or(And(m2, k4), And(m3, k8)));
+}
+
+#endif  // HWY_NATIVE_AES
+#endif  // HWY_TARGET != HWY_SCALAR
+
+// "Include guard": skip if native POPCNT-related instructions are available.
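+// (Context for the guarded section below: the u8 PopulationCount splits each
+// byte into nibbles and sums two 16-entry table lookups. An illustrative
+// scalar analogue follows; the helper name is hypothetical, not a Highway API.)
+inline uint8_t PopCountByteExample(uint8_t v) {
+  static constexpr uint8_t kNibbleBits[16] = {0, 1, 1, 2, 1, 2, 2, 3,
+                                              1, 2, 2, 3, 2, 3, 3, 4};
+  // E.g. v = 0xB7: low nibble 0x7 -> 3 bits, high nibble 0xB -> 3 bits => 6.
+  return static_cast<uint8_t>(kNibbleBits[v & 0xF] + kNibbleBits[v >> 4]);
+}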
+#if (defined(HWY_NATIVE_POPCNT) == defined(HWY_TARGET_TOGGLE))
+#ifdef HWY_NATIVE_POPCNT
+#undef HWY_NATIVE_POPCNT
+#else
+#define HWY_NATIVE_POPCNT
+#endif
+
+#undef HWY_MIN_POW2_FOR_128
+#if HWY_TARGET == HWY_RVV
+#define HWY_MIN_POW2_FOR_128 1
+#else
+// All other targets except HWY_SCALAR (which is excluded by HWY_IF_GE128_D)
+// guarantee 128 bits anyway.
+#define HWY_MIN_POW2_FOR_128 0
+#endif
+
+// This algorithm requires vectors to be at least 16 bytes, which is the case
+// for LMUL >= 2. If not, use the fallback below.
+template <typename V, class D = DFromV<V>, HWY_IF_LANE_SIZE_D(D, 1),
+          HWY_IF_GE128_D(D), HWY_IF_POW2_GE(D, HWY_MIN_POW2_FOR_128)>
+HWY_API V PopulationCount(V v) {
+  static_assert(IsSame<TFromD<D>, uint8_t>(), "V must be u8");
+  const D d;
+  HWY_ALIGN constexpr uint8_t kLookup[16] = {
+      0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
+  };
+  const auto lo = And(v, Set(d, 0xF));
+  const auto hi = ShiftRight<4>(v);
+  const auto lookup = LoadDup128(d, kLookup);
+  return Add(TableLookupBytes(lookup, hi), TableLookupBytes(lookup, lo));
+}
+
+// RVV has a specialization that avoids the Set().
+#if HWY_TARGET != HWY_RVV
+// Slower fallback for capped vectors.
+template <typename V, class D = DFromV<V>, HWY_IF_LANE_SIZE_D(D, 1),
+          HWY_IF_LT128_D(D)>
+HWY_API V PopulationCount(V v) {
+  static_assert(IsSame<TFromD<D>, uint8_t>(), "V must be u8");
+  const D d;
+  // See https://arxiv.org/pdf/1611.07612.pdf, Figure 3
+  v = Sub(v, And(ShiftRight<1>(v), Set(d, 0x55)));
+  v = Add(And(ShiftRight<2>(v), Set(d, 0x33)), And(v, Set(d, 0x33)));
+  return And(Add(v, ShiftRight<4>(v)), Set(d, 0x0F));
+}
+#endif  // HWY_TARGET != HWY_RVV
+
+template <typename V, class D = DFromV<V>, HWY_IF_LANE_SIZE_D(D, 2)>
+HWY_API V PopulationCount(V v) {
+  static_assert(IsSame<TFromD<D>, uint16_t>(), "V must be u16");
+  const D d;
+  const Repartition<uint8_t, decltype(d)> d8;
+  const auto vals = BitCast(d, PopulationCount(BitCast(d8, v)));
+  return Add(ShiftRight<8>(vals), And(vals, Set(d, 0xFF)));
+}
+
+template <typename V, class D = DFromV<V>, HWY_IF_LANE_SIZE_D(D, 4)>
+HWY_API V PopulationCount(V v) {
+  static_assert(IsSame<TFromD<D>, uint32_t>(), "V must be u32");
+  const D d;
+  Repartition<uint16_t, decltype(d)> d16;
+  auto vals = BitCast(d, PopulationCount(BitCast(d16, v)));
+  return Add(ShiftRight<16>(vals), And(vals, Set(d, 0xFF)));
+}
+
+#if HWY_HAVE_INTEGER64
+template <typename V, class D = DFromV<V>, HWY_IF_LANE_SIZE_D(D, 8)>
+HWY_API V PopulationCount(V v) {
+  static_assert(IsSame<TFromD<D>, uint64_t>(), "V must be u64");
+  const D d;
+  Repartition<uint32_t, decltype(d)> d32;
+  auto vals = BitCast(d, PopulationCount(BitCast(d32, v)));
+  return Add(ShiftRight<32>(vals), And(vals, Set(d, 0xFF)));
+}
+#endif
+
+#endif  // HWY_NATIVE_POPCNT
+
+template <typename V, class D = DFromV<V>, HWY_IF_LANE_SIZE_D(D, 8),
+          HWY_IF_LT128_D(D)>
+HWY_API V operator*(V x, V y) {
+  return Set(D(), GetLane(x) * GetLane(y));
+}
+
+// "Include guard": skip if native 64-bit mul instructions are available.
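+// (Context for the guarded section below: 64-bit lane products are built from
+// 32x32->64 partial products, dropping xh*yh because it only affects bits
+// >= 64. Scalar sketch with a hypothetical helper name, not a Highway API:)
+inline uint64_t MulLo64Example(uint64_t x, uint64_t y) {
+  const uint64_t xl = x & 0xFFFFFFFFu;  // lower halves
+  const uint64_t xh = x >> 32;          // upper halves
+  const uint64_t yl = y & 0xFFFFFFFFu;
+  const uint64_t yh = y >> 32;
+  // x*y mod 2^64 = xl*yl + ((xl*yh + xh*yl) << 32)
+  return xl * yl + ((xl * yh + xh * yl) << 32);
+}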
+#if (defined(HWY_NATIVE_I64MULLO) == defined(HWY_TARGET_TOGGLE))
+#ifdef HWY_NATIVE_I64MULLO
+#undef HWY_NATIVE_I64MULLO
+#else
+#define HWY_NATIVE_I64MULLO
+#endif
+
+template <class V, class D64 = DFromV<V>, typename T = LaneType<V>,
+          HWY_IF_LANE_SIZE(T, 8), HWY_IF_UNSIGNED(T), HWY_IF_GE128_D(D64)>
+HWY_API V operator*(V x, V y) {
+  RepartitionToNarrow<D64> d32;
+  auto x32 = BitCast(d32, x);
+  auto y32 = BitCast(d32, y);
+  auto lolo = BitCast(d32, MulEven(x32, y32));
+  auto lohi = BitCast(d32, MulEven(x32, BitCast(d32, ShiftRight<32>(y))));
+  auto hilo = BitCast(d32, MulEven(BitCast(d32, ShiftRight<32>(x)), y32));
+  auto hi = BitCast(d32, ShiftLeft<32>(BitCast(D64{}, lohi + hilo)));
+  return BitCast(D64{}, lolo + hi);
+}
+template <class V, class DI64 = DFromV<V>, typename T = LaneType<V>,
+          HWY_IF_LANE_SIZE(T, 8), HWY_IF_SIGNED(T), HWY_IF_GE128_D(DI64)>
+HWY_API V operator*(V x, V y) {
+  RebindToUnsigned<DI64> du64;
+  return BitCast(DI64{}, BitCast(du64, x) * BitCast(du64, y));
+}
+
+#endif  // HWY_NATIVE_I64MULLO
+
+// ================================================== Operator wrapper
+
+// These targets currently cannot define operators and have already defined
+// (only) the corresponding functions such as Add.
+#if HWY_TARGET != HWY_RVV && HWY_TARGET != HWY_SVE && \
+    HWY_TARGET != HWY_SVE2 && HWY_TARGET != HWY_SVE_256 && \
+    HWY_TARGET != HWY_SVE2_128
+
+template <class V>
+HWY_API V Add(V a, V b) {
+  return a + b;
+}
+template <class V>
+HWY_API V Sub(V a, V b) {
+  return a - b;
+}
+
+template <class V>
+HWY_API V Mul(V a, V b) {
+  return a * b;
+}
+template <class V>
+HWY_API V Div(V a, V b) {
+  return a / b;
+}
+
+template <class V>
+V Shl(V a, V b) {
+  return a << b;
+}
+template <class V>
+V Shr(V a, V b) {
+  return a >> b;
+}
+
+template <class V>
+HWY_API auto Eq(V a, V b) -> decltype(a == b) {
+  return a == b;
+}
+template <class V>
+HWY_API auto Ne(V a, V b) -> decltype(a == b) {
+  return a != b;
+}
+template <class V>
+HWY_API auto Lt(V a, V b) -> decltype(a == b) {
+  return a < b;
+}
+
+template <class V>
+HWY_API auto Gt(V a, V b) -> decltype(a == b) {
+  return a > b;
+}
+template <class V>
+HWY_API auto Ge(V a, V b) -> decltype(a == b) {
+  return a >= b;
+}
+
+template <class V>
+HWY_API auto Le(V a, V b) -> decltype(a == b) {
+  return a <= b;
+}
+
+#endif  // HWY_TARGET for operators
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace hwy
+HWY_AFTER_NAMESPACE();
diff --git a/hwy/ops/rvv-inl.h b/hwy/ops/rvv-inl.h
new file mode 100644
index 0000000..2a8fb52
--- /dev/null
+++ b/hwy/ops/rvv-inl.h
@@ -0,0 +1,3405 @@
+// Copyright 2021 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// RISC-V V vectors (length not known at compile time).
+// External include guard in highway.h - see comment there.
+
+#include <riscv_vector.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#include "hwy/base.h"
+#include "hwy/ops/shared-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+template <class V>
+struct DFromV_t {};  // specialized in macros
+template <class V>
+using DFromV = typename DFromV_t<RemoveConst<V>>::type;
+
+template <class V>
+using TFromV = TFromD<DFromV<V>>;
+
+// Enables the overload if Pow2 is in [min, max].
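+// (Usage sketch: an overload declared as
+//   template <class D, HWY_RVV_IF_POW2_IN(D, -3, 3)>
+//   void F(D d);
+// participates in overload resolution only for kPow2 in [-3, 3], i.e. LMUL
+// between 1/8 and 8.)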
+#define HWY_RVV_IF_POW2_IN(D, min, max) \ + hwy::EnableIf<(min) <= Pow2(D()) && Pow2(D()) <= (max)>* = nullptr + +template +constexpr size_t MLenFromD(Simd /* tag */) { + // Returns divisor = type bits / LMUL. Folding *8 into the ScaleByPower + // argument enables fractional LMUL < 1. Limit to 64 because that is the + // largest value for which vbool##_t are defined. + return HWY_MIN(64, sizeof(T) * 8 * 8 / detail::ScaleByPower(8, kPow2)); +} + +// ================================================== MACROS + +// Generate specializations and function definitions using X macros. Although +// harder to read and debug, writing everything manually is too bulky. + +namespace detail { // for code folding + +// For all mask sizes MLEN: (1/Nth of a register, one bit per lane) +// The first two arguments are SEW and SHIFT such that SEW >> SHIFT = MLEN. +#define HWY_RVV_FOREACH_B(X_MACRO, NAME, OP) \ + X_MACRO(64, 0, 64, NAME, OP) \ + X_MACRO(32, 0, 32, NAME, OP) \ + X_MACRO(16, 0, 16, NAME, OP) \ + X_MACRO(8, 0, 8, NAME, OP) \ + X_MACRO(8, 1, 4, NAME, OP) \ + X_MACRO(8, 2, 2, NAME, OP) \ + X_MACRO(8, 3, 1, NAME, OP) + +// For given SEW, iterate over one of LMULS: _TRUNC, _EXT, _ALL. This allows +// reusing type lists such as HWY_RVV_FOREACH_U for _ALL (the usual case) or +// _EXT (for Combine). To achieve this, we HWY_CONCAT with the LMULS suffix. +// +// Precompute SEW/LMUL => MLEN to allow token-pasting the result. For the same +// reason, also pass the double-width and half SEW and LMUL (suffixed D and H, +// respectively). "__" means there is no corresponding LMUL (e.g. LMULD for m8). +// Args: BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP + +// LMULS = _TRUNC: truncatable (not the smallest LMUL) +#define HWY_RVV_FOREACH_08_TRUNC(X_MACRO, BASE, CHAR, NAME, OP) \ + X_MACRO(BASE, CHAR, 8, 16, __, mf4, mf2, mf8, -2, /*MLEN=*/32, NAME, OP) \ + X_MACRO(BASE, CHAR, 8, 16, __, mf2, m1, mf4, -1, /*MLEN=*/16, NAME, OP) \ + X_MACRO(BASE, CHAR, 8, 16, __, m1, m2, mf2, 0, /*MLEN=*/8, NAME, OP) \ + X_MACRO(BASE, CHAR, 8, 16, __, m2, m4, m1, 1, /*MLEN=*/4, NAME, OP) \ + X_MACRO(BASE, CHAR, 8, 16, __, m4, m8, m2, 2, /*MLEN=*/2, NAME, OP) \ + X_MACRO(BASE, CHAR, 8, 16, __, m8, __, m4, 3, /*MLEN=*/1, NAME, OP) + +#define HWY_RVV_FOREACH_16_TRUNC(X_MACRO, BASE, CHAR, NAME, OP) \ + X_MACRO(BASE, CHAR, 16, 32, 8, mf2, m1, mf4, -1, /*MLEN=*/32, NAME, OP) \ + X_MACRO(BASE, CHAR, 16, 32, 8, m1, m2, mf2, 0, /*MLEN=*/16, NAME, OP) \ + X_MACRO(BASE, CHAR, 16, 32, 8, m2, m4, m1, 1, /*MLEN=*/8, NAME, OP) \ + X_MACRO(BASE, CHAR, 16, 32, 8, m4, m8, m2, 2, /*MLEN=*/4, NAME, OP) \ + X_MACRO(BASE, CHAR, 16, 32, 8, m8, __, m4, 3, /*MLEN=*/2, NAME, OP) + +#define HWY_RVV_FOREACH_32_TRUNC(X_MACRO, BASE, CHAR, NAME, OP) \ + X_MACRO(BASE, CHAR, 32, 64, 16, m1, m2, mf2, 0, /*MLEN=*/32, NAME, OP) \ + X_MACRO(BASE, CHAR, 32, 64, 16, m2, m4, m1, 1, /*MLEN=*/16, NAME, OP) \ + X_MACRO(BASE, CHAR, 32, 64, 16, m4, m8, m2, 2, /*MLEN=*/8, NAME, OP) \ + X_MACRO(BASE, CHAR, 32, 64, 16, m8, __, m4, 3, /*MLEN=*/4, NAME, OP) + +#define HWY_RVV_FOREACH_64_TRUNC(X_MACRO, BASE, CHAR, NAME, OP) \ + X_MACRO(BASE, CHAR, 64, __, 32, m2, m4, m1, 1, /*MLEN=*/32, NAME, OP) \ + X_MACRO(BASE, CHAR, 64, __, 32, m4, m8, m2, 2, /*MLEN=*/16, NAME, OP) \ + X_MACRO(BASE, CHAR, 64, __, 32, m8, __, m4, 3, /*MLEN=*/8, NAME, OP) + +// LMULS = _DEMOTE: can demote from SEW*LMUL to SEWH*LMULH. 
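+// (Example: i16m2, i.e. SEW=16 with LMUL=2, demotes to i8m1: SEWH=8 and
+// LMULH=1, since halving the lane size at a constant lane count also halves
+// the register group size.)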
+#define HWY_RVV_FOREACH_08_DEMOTE(X_MACRO, BASE, CHAR, NAME, OP) \ + X_MACRO(BASE, CHAR, 8, 16, __, mf4, mf2, mf8, -2, /*MLEN=*/32, NAME, OP) \ + X_MACRO(BASE, CHAR, 8, 16, __, mf2, m1, mf4, -1, /*MLEN=*/16, NAME, OP) \ + X_MACRO(BASE, CHAR, 8, 16, __, m1, m2, mf2, 0, /*MLEN=*/8, NAME, OP) \ + X_MACRO(BASE, CHAR, 8, 16, __, m2, m4, m1, 1, /*MLEN=*/4, NAME, OP) \ + X_MACRO(BASE, CHAR, 8, 16, __, m4, m8, m2, 2, /*MLEN=*/2, NAME, OP) \ + X_MACRO(BASE, CHAR, 8, 16, __, m8, __, m4, 3, /*MLEN=*/1, NAME, OP) + +#define HWY_RVV_FOREACH_16_DEMOTE(X_MACRO, BASE, CHAR, NAME, OP) \ + X_MACRO(BASE, CHAR, 16, 32, 8, mf4, mf2, mf8, -2, /*MLEN=*/64, NAME, OP) \ + X_MACRO(BASE, CHAR, 16, 32, 8, mf2, m1, mf4, -1, /*MLEN=*/32, NAME, OP) \ + X_MACRO(BASE, CHAR, 16, 32, 8, m1, m2, mf2, 0, /*MLEN=*/16, NAME, OP) \ + X_MACRO(BASE, CHAR, 16, 32, 8, m2, m4, m1, 1, /*MLEN=*/8, NAME, OP) \ + X_MACRO(BASE, CHAR, 16, 32, 8, m4, m8, m2, 2, /*MLEN=*/4, NAME, OP) \ + X_MACRO(BASE, CHAR, 16, 32, 8, m8, __, m4, 3, /*MLEN=*/2, NAME, OP) + +#define HWY_RVV_FOREACH_32_DEMOTE(X_MACRO, BASE, CHAR, NAME, OP) \ + X_MACRO(BASE, CHAR, 32, 64, 16, mf2, m1, mf4, -1, /*MLEN=*/64, NAME, OP) \ + X_MACRO(BASE, CHAR, 32, 64, 16, m1, m2, mf2, 0, /*MLEN=*/32, NAME, OP) \ + X_MACRO(BASE, CHAR, 32, 64, 16, m2, m4, m1, 1, /*MLEN=*/16, NAME, OP) \ + X_MACRO(BASE, CHAR, 32, 64, 16, m4, m8, m2, 2, /*MLEN=*/8, NAME, OP) \ + X_MACRO(BASE, CHAR, 32, 64, 16, m8, __, m4, 3, /*MLEN=*/4, NAME, OP) + +#define HWY_RVV_FOREACH_64_DEMOTE(X_MACRO, BASE, CHAR, NAME, OP) \ + X_MACRO(BASE, CHAR, 64, __, 32, m1, m2, mf2, 0, /*MLEN=*/64, NAME, OP) \ + X_MACRO(BASE, CHAR, 64, __, 32, m2, m4, m1, 1, /*MLEN=*/32, NAME, OP) \ + X_MACRO(BASE, CHAR, 64, __, 32, m4, m8, m2, 2, /*MLEN=*/16, NAME, OP) \ + X_MACRO(BASE, CHAR, 64, __, 32, m8, __, m4, 3, /*MLEN=*/8, NAME, OP) + +// LMULS = _LE2: <= 2 +#define HWY_RVV_FOREACH_08_LE2(X_MACRO, BASE, CHAR, NAME, OP) \ + X_MACRO(BASE, CHAR, 8, 16, __, mf8, mf4, __, -3, /*MLEN=*/64, NAME, OP) \ + X_MACRO(BASE, CHAR, 8, 16, __, mf4, mf2, mf8, -2, /*MLEN=*/32, NAME, OP) \ + X_MACRO(BASE, CHAR, 8, 16, __, mf2, m1, mf4, -1, /*MLEN=*/16, NAME, OP) \ + X_MACRO(BASE, CHAR, 8, 16, __, m1, m2, mf2, 0, /*MLEN=*/8, NAME, OP) \ + X_MACRO(BASE, CHAR, 8, 16, __, m2, m4, m1, 1, /*MLEN=*/4, NAME, OP) + +#define HWY_RVV_FOREACH_16_LE2(X_MACRO, BASE, CHAR, NAME, OP) \ + X_MACRO(BASE, CHAR, 16, 32, 8, mf4, mf2, mf8, -2, /*MLEN=*/64, NAME, OP) \ + X_MACRO(BASE, CHAR, 16, 32, 8, mf2, m1, mf4, -1, /*MLEN=*/32, NAME, OP) \ + X_MACRO(BASE, CHAR, 16, 32, 8, m1, m2, mf2, 0, /*MLEN=*/16, NAME, OP) \ + X_MACRO(BASE, CHAR, 16, 32, 8, m2, m4, m1, 1, /*MLEN=*/8, NAME, OP) + +#define HWY_RVV_FOREACH_32_LE2(X_MACRO, BASE, CHAR, NAME, OP) \ + X_MACRO(BASE, CHAR, 32, 64, 16, mf2, m1, mf4, -1, /*MLEN=*/64, NAME, OP) \ + X_MACRO(BASE, CHAR, 32, 64, 16, m1, m2, mf2, 0, /*MLEN=*/32, NAME, OP) \ + X_MACRO(BASE, CHAR, 32, 64, 16, m2, m4, m1, 1, /*MLEN=*/16, NAME, OP) + +#define HWY_RVV_FOREACH_64_LE2(X_MACRO, BASE, CHAR, NAME, OP) \ + X_MACRO(BASE, CHAR, 64, __, 32, m1, m2, mf2, 0, /*MLEN=*/64, NAME, OP) \ + X_MACRO(BASE, CHAR, 64, __, 32, m2, m4, m1, 1, /*MLEN=*/32, NAME, OP) + +// LMULS = _EXT: not the largest LMUL +#define HWY_RVV_FOREACH_08_EXT(X_MACRO, BASE, CHAR, NAME, OP) \ + HWY_RVV_FOREACH_08_LE2(X_MACRO, BASE, CHAR, NAME, OP) \ + X_MACRO(BASE, CHAR, 8, 16, __, m4, m8, m2, 2, /*MLEN=*/2, NAME, OP) + +#define HWY_RVV_FOREACH_16_EXT(X_MACRO, BASE, CHAR, NAME, OP) \ + HWY_RVV_FOREACH_16_LE2(X_MACRO, BASE, CHAR, NAME, OP) \ + X_MACRO(BASE, CHAR, 16, 32, 8, m4, m8, 
m2, 2, /*MLEN=*/4, NAME, OP) + +#define HWY_RVV_FOREACH_32_EXT(X_MACRO, BASE, CHAR, NAME, OP) \ + HWY_RVV_FOREACH_32_LE2(X_MACRO, BASE, CHAR, NAME, OP) \ + X_MACRO(BASE, CHAR, 32, 64, 16, m4, m8, m2, 2, /*MLEN=*/8, NAME, OP) + +#define HWY_RVV_FOREACH_64_EXT(X_MACRO, BASE, CHAR, NAME, OP) \ + HWY_RVV_FOREACH_64_LE2(X_MACRO, BASE, CHAR, NAME, OP) \ + X_MACRO(BASE, CHAR, 64, __, 32, m4, m8, m2, 2, /*MLEN=*/16, NAME, OP) + +// LMULS = _ALL (2^MinPow2() <= LMUL <= 8) +#define HWY_RVV_FOREACH_08_ALL(X_MACRO, BASE, CHAR, NAME, OP) \ + HWY_RVV_FOREACH_08_EXT(X_MACRO, BASE, CHAR, NAME, OP) \ + X_MACRO(BASE, CHAR, 8, 16, __, m8, __, m4, 3, /*MLEN=*/1, NAME, OP) + +#define HWY_RVV_FOREACH_16_ALL(X_MACRO, BASE, CHAR, NAME, OP) \ + HWY_RVV_FOREACH_16_EXT(X_MACRO, BASE, CHAR, NAME, OP) \ + X_MACRO(BASE, CHAR, 16, 32, 8, m8, __, m4, 3, /*MLEN=*/2, NAME, OP) + +#define HWY_RVV_FOREACH_32_ALL(X_MACRO, BASE, CHAR, NAME, OP) \ + HWY_RVV_FOREACH_32_EXT(X_MACRO, BASE, CHAR, NAME, OP) \ + X_MACRO(BASE, CHAR, 32, 64, 16, m8, __, m4, 3, /*MLEN=*/4, NAME, OP) + +#define HWY_RVV_FOREACH_64_ALL(X_MACRO, BASE, CHAR, NAME, OP) \ + HWY_RVV_FOREACH_64_EXT(X_MACRO, BASE, CHAR, NAME, OP) \ + X_MACRO(BASE, CHAR, 64, __, 32, m8, __, m4, 3, /*MLEN=*/8, NAME, OP) + +// 'Virtual' LMUL. This upholds the Highway guarantee that vectors are at least +// 128 bit and LowerHalf is defined whenever there are at least 2 lanes, even +// though RISC-V LMUL must be at least SEW/64 (notice that this rules out +// LMUL=1/2 for SEW=64). To bridge the gap, we add overloads for kPow2 equal to +// one less than should be supported, with all other parameters (vector type +// etc.) unchanged. For D with the lowest kPow2 ('virtual LMUL'), Lanes() +// returns half of what it usually would. +// +// Notice that we can only add overloads whenever there is a D argument: those +// are unique with respect to non-virtual-LMUL overloads because their kPow2 +// template argument differs. Otherwise, there is no actual vuint64mf2_t, and +// defining another overload with the same LMUL would be an error. Thus we have +// a separate _VIRT category for HWY_RVV_FOREACH*, and the common case is +// _ALL_VIRT (meaning the regular LMUL plus the VIRT overloads), used in most +// functions that take a D. 
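+// (Concrete example: ScalableTag<uint64_t, -1> would require vuint64mf2_t,
+// which the intrinsics do not provide. The _VIRT entries below therefore
+// reuse the m1 type for kPow2 == -1, and Lanes() reports half of the m1
+// lane count for such D.)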
+ +#define HWY_RVV_FOREACH_08_VIRT(X_MACRO, BASE, CHAR, NAME, OP) + +#define HWY_RVV_FOREACH_16_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \ + X_MACRO(BASE, CHAR, 16, 32, 8, mf4, mf2, mf8, -3, /*MLEN=*/64, NAME, OP) + +#define HWY_RVV_FOREACH_32_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \ + X_MACRO(BASE, CHAR, 32, 64, 16, mf2, m1, mf4, -2, /*MLEN=*/64, NAME, OP) + +#define HWY_RVV_FOREACH_64_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \ + X_MACRO(BASE, CHAR, 64, __, 32, m1, m2, mf2, -1, /*MLEN=*/64, NAME, OP) + +// ALL + VIRT +#define HWY_RVV_FOREACH_08_ALL_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \ + HWY_RVV_FOREACH_08_ALL(X_MACRO, BASE, CHAR, NAME, OP) \ + HWY_RVV_FOREACH_08_VIRT(X_MACRO, BASE, CHAR, NAME, OP) + +#define HWY_RVV_FOREACH_16_ALL_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \ + HWY_RVV_FOREACH_16_ALL(X_MACRO, BASE, CHAR, NAME, OP) \ + HWY_RVV_FOREACH_16_VIRT(X_MACRO, BASE, CHAR, NAME, OP) + +#define HWY_RVV_FOREACH_32_ALL_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \ + HWY_RVV_FOREACH_32_ALL(X_MACRO, BASE, CHAR, NAME, OP) \ + HWY_RVV_FOREACH_32_VIRT(X_MACRO, BASE, CHAR, NAME, OP) + +#define HWY_RVV_FOREACH_64_ALL_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \ + HWY_RVV_FOREACH_64_ALL(X_MACRO, BASE, CHAR, NAME, OP) \ + HWY_RVV_FOREACH_64_VIRT(X_MACRO, BASE, CHAR, NAME, OP) + +// LE2 + VIRT +#define HWY_RVV_FOREACH_08_LE2_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \ + HWY_RVV_FOREACH_08_LE2(X_MACRO, BASE, CHAR, NAME, OP) \ + HWY_RVV_FOREACH_08_VIRT(X_MACRO, BASE, CHAR, NAME, OP) + +#define HWY_RVV_FOREACH_16_LE2_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \ + HWY_RVV_FOREACH_16_LE2(X_MACRO, BASE, CHAR, NAME, OP) \ + HWY_RVV_FOREACH_16_VIRT(X_MACRO, BASE, CHAR, NAME, OP) + +#define HWY_RVV_FOREACH_32_LE2_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \ + HWY_RVV_FOREACH_32_LE2(X_MACRO, BASE, CHAR, NAME, OP) \ + HWY_RVV_FOREACH_32_VIRT(X_MACRO, BASE, CHAR, NAME, OP) + +#define HWY_RVV_FOREACH_64_LE2_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \ + HWY_RVV_FOREACH_64_LE2(X_MACRO, BASE, CHAR, NAME, OP) \ + HWY_RVV_FOREACH_64_VIRT(X_MACRO, BASE, CHAR, NAME, OP) + +// EXT + VIRT +#define HWY_RVV_FOREACH_08_EXT_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \ + HWY_RVV_FOREACH_08_EXT(X_MACRO, BASE, CHAR, NAME, OP) \ + HWY_RVV_FOREACH_08_VIRT(X_MACRO, BASE, CHAR, NAME, OP) + +#define HWY_RVV_FOREACH_16_EXT_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \ + HWY_RVV_FOREACH_16_EXT(X_MACRO, BASE, CHAR, NAME, OP) \ + HWY_RVV_FOREACH_16_VIRT(X_MACRO, BASE, CHAR, NAME, OP) + +#define HWY_RVV_FOREACH_32_EXT_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \ + HWY_RVV_FOREACH_32_EXT(X_MACRO, BASE, CHAR, NAME, OP) \ + HWY_RVV_FOREACH_32_VIRT(X_MACRO, BASE, CHAR, NAME, OP) + +#define HWY_RVV_FOREACH_64_EXT_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \ + HWY_RVV_FOREACH_64_EXT(X_MACRO, BASE, CHAR, NAME, OP) \ + HWY_RVV_FOREACH_64_VIRT(X_MACRO, BASE, CHAR, NAME, OP) + +// DEMOTE + VIRT +#define HWY_RVV_FOREACH_08_DEMOTE_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \ + HWY_RVV_FOREACH_08_DEMOTE(X_MACRO, BASE, CHAR, NAME, OP) \ + HWY_RVV_FOREACH_08_VIRT(X_MACRO, BASE, CHAR, NAME, OP) + +#define HWY_RVV_FOREACH_16_DEMOTE_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \ + HWY_RVV_FOREACH_16_DEMOTE(X_MACRO, BASE, CHAR, NAME, OP) \ + HWY_RVV_FOREACH_16_VIRT(X_MACRO, BASE, CHAR, NAME, OP) + +#define HWY_RVV_FOREACH_32_DEMOTE_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \ + HWY_RVV_FOREACH_32_DEMOTE(X_MACRO, BASE, CHAR, NAME, OP) \ + HWY_RVV_FOREACH_32_VIRT(X_MACRO, BASE, CHAR, NAME, OP) + +#define HWY_RVV_FOREACH_64_DEMOTE_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \ + HWY_RVV_FOREACH_64_DEMOTE(X_MACRO, BASE, CHAR, NAME, OP) \ + HWY_RVV_FOREACH_64_VIRT(X_MACRO, 
BASE, CHAR, NAME, OP) + +// SEW for unsigned: +#define HWY_RVV_FOREACH_U08(X_MACRO, NAME, OP, LMULS) \ + HWY_CONCAT(HWY_RVV_FOREACH_08, LMULS)(X_MACRO, uint, u, NAME, OP) +#define HWY_RVV_FOREACH_U16(X_MACRO, NAME, OP, LMULS) \ + HWY_CONCAT(HWY_RVV_FOREACH_16, LMULS)(X_MACRO, uint, u, NAME, OP) +#define HWY_RVV_FOREACH_U32(X_MACRO, NAME, OP, LMULS) \ + HWY_CONCAT(HWY_RVV_FOREACH_32, LMULS)(X_MACRO, uint, u, NAME, OP) +#define HWY_RVV_FOREACH_U64(X_MACRO, NAME, OP, LMULS) \ + HWY_CONCAT(HWY_RVV_FOREACH_64, LMULS)(X_MACRO, uint, u, NAME, OP) + +// SEW for signed: +#define HWY_RVV_FOREACH_I08(X_MACRO, NAME, OP, LMULS) \ + HWY_CONCAT(HWY_RVV_FOREACH_08, LMULS)(X_MACRO, int, i, NAME, OP) +#define HWY_RVV_FOREACH_I16(X_MACRO, NAME, OP, LMULS) \ + HWY_CONCAT(HWY_RVV_FOREACH_16, LMULS)(X_MACRO, int, i, NAME, OP) +#define HWY_RVV_FOREACH_I32(X_MACRO, NAME, OP, LMULS) \ + HWY_CONCAT(HWY_RVV_FOREACH_32, LMULS)(X_MACRO, int, i, NAME, OP) +#define HWY_RVV_FOREACH_I64(X_MACRO, NAME, OP, LMULS) \ + HWY_CONCAT(HWY_RVV_FOREACH_64, LMULS)(X_MACRO, int, i, NAME, OP) + +// SEW for float: +#if HWY_HAVE_FLOAT16 +#define HWY_RVV_FOREACH_F16(X_MACRO, NAME, OP, LMULS) \ + HWY_CONCAT(HWY_RVV_FOREACH_16, LMULS)(X_MACRO, float, f, NAME, OP) +#else +#define HWY_RVV_FOREACH_F16(X_MACRO, NAME, OP, LMULS) +#endif +#define HWY_RVV_FOREACH_F32(X_MACRO, NAME, OP, LMULS) \ + HWY_CONCAT(HWY_RVV_FOREACH_32, LMULS)(X_MACRO, float, f, NAME, OP) +#define HWY_RVV_FOREACH_F64(X_MACRO, NAME, OP, LMULS) \ + HWY_CONCAT(HWY_RVV_FOREACH_64, LMULS)(X_MACRO, float, f, NAME, OP) + +// Commonly used type/SEW groups: +#define HWY_RVV_FOREACH_UI08(X_MACRO, NAME, OP, LMULS) \ + HWY_RVV_FOREACH_U08(X_MACRO, NAME, OP, LMULS) \ + HWY_RVV_FOREACH_I08(X_MACRO, NAME, OP, LMULS) + +#define HWY_RVV_FOREACH_UI16(X_MACRO, NAME, OP, LMULS) \ + HWY_RVV_FOREACH_U16(X_MACRO, NAME, OP, LMULS) \ + HWY_RVV_FOREACH_I16(X_MACRO, NAME, OP, LMULS) + +#define HWY_RVV_FOREACH_UI32(X_MACRO, NAME, OP, LMULS) \ + HWY_RVV_FOREACH_U32(X_MACRO, NAME, OP, LMULS) \ + HWY_RVV_FOREACH_I32(X_MACRO, NAME, OP, LMULS) + +#define HWY_RVV_FOREACH_UI64(X_MACRO, NAME, OP, LMULS) \ + HWY_RVV_FOREACH_U64(X_MACRO, NAME, OP, LMULS) \ + HWY_RVV_FOREACH_I64(X_MACRO, NAME, OP, LMULS) + +#define HWY_RVV_FOREACH_UI3264(X_MACRO, NAME, OP, LMULS) \ + HWY_RVV_FOREACH_UI32(X_MACRO, NAME, OP, LMULS) \ + HWY_RVV_FOREACH_UI64(X_MACRO, NAME, OP, LMULS) + +#define HWY_RVV_FOREACH_U163264(X_MACRO, NAME, OP, LMULS) \ + HWY_RVV_FOREACH_U16(X_MACRO, NAME, OP, LMULS) \ + HWY_RVV_FOREACH_U32(X_MACRO, NAME, OP, LMULS) \ + HWY_RVV_FOREACH_U64(X_MACRO, NAME, OP, LMULS) + +#define HWY_RVV_FOREACH_I163264(X_MACRO, NAME, OP, LMULS) \ + HWY_RVV_FOREACH_I16(X_MACRO, NAME, OP, LMULS) \ + HWY_RVV_FOREACH_I32(X_MACRO, NAME, OP, LMULS) \ + HWY_RVV_FOREACH_I64(X_MACRO, NAME, OP, LMULS) + +#define HWY_RVV_FOREACH_UI163264(X_MACRO, NAME, OP, LMULS) \ + HWY_RVV_FOREACH_U163264(X_MACRO, NAME, OP, LMULS) \ + HWY_RVV_FOREACH_I163264(X_MACRO, NAME, OP, LMULS) + +#define HWY_RVV_FOREACH_F3264(X_MACRO, NAME, OP, LMULS) \ + HWY_RVV_FOREACH_F32(X_MACRO, NAME, OP, LMULS) \ + HWY_RVV_FOREACH_F64(X_MACRO, NAME, OP, LMULS) + +// For all combinations of SEW: +#define HWY_RVV_FOREACH_U(X_MACRO, NAME, OP, LMULS) \ + HWY_RVV_FOREACH_U08(X_MACRO, NAME, OP, LMULS) \ + HWY_RVV_FOREACH_U16(X_MACRO, NAME, OP, LMULS) \ + HWY_RVV_FOREACH_U32(X_MACRO, NAME, OP, LMULS) \ + HWY_RVV_FOREACH_U64(X_MACRO, NAME, OP, LMULS) + +#define HWY_RVV_FOREACH_I(X_MACRO, NAME, OP, LMULS) \ + HWY_RVV_FOREACH_I08(X_MACRO, NAME, OP, LMULS) \ + 
HWY_RVV_FOREACH_I16(X_MACRO, NAME, OP, LMULS) \ + HWY_RVV_FOREACH_I32(X_MACRO, NAME, OP, LMULS) \ + HWY_RVV_FOREACH_I64(X_MACRO, NAME, OP, LMULS) + +#define HWY_RVV_FOREACH_F(X_MACRO, NAME, OP, LMULS) \ + HWY_RVV_FOREACH_F16(X_MACRO, NAME, OP, LMULS) \ + HWY_RVV_FOREACH_F3264(X_MACRO, NAME, OP, LMULS) + +// Commonly used type categories: +#define HWY_RVV_FOREACH_UI(X_MACRO, NAME, OP, LMULS) \ + HWY_RVV_FOREACH_U(X_MACRO, NAME, OP, LMULS) \ + HWY_RVV_FOREACH_I(X_MACRO, NAME, OP, LMULS) + +#define HWY_RVV_FOREACH(X_MACRO, NAME, OP, LMULS) \ + HWY_RVV_FOREACH_U(X_MACRO, NAME, OP, LMULS) \ + HWY_RVV_FOREACH_I(X_MACRO, NAME, OP, LMULS) \ + HWY_RVV_FOREACH_F(X_MACRO, NAME, OP, LMULS) + +// Assemble types for use in x-macros +#define HWY_RVV_T(BASE, SEW) BASE##SEW##_t +#define HWY_RVV_D(BASE, SEW, N, SHIFT) Simd +#define HWY_RVV_V(BASE, SEW, LMUL) v##BASE##SEW##LMUL##_t +#define HWY_RVV_M(MLEN) vbool##MLEN##_t + +} // namespace detail + +// Until we have full intrinsic support for fractional LMUL, mixed-precision +// code can use LMUL 1..8 (adequate unless they need many registers). +#define HWY_SPECIALIZE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \ + MLEN, NAME, OP) \ + template <> \ + struct DFromV_t { \ + using Lane = HWY_RVV_T(BASE, SEW); \ + using type = ScalableTag; \ + }; + +HWY_RVV_FOREACH(HWY_SPECIALIZE, _, _, _ALL) +#undef HWY_SPECIALIZE + +// ------------------------------ Lanes + +// WARNING: we want to query VLMAX/sizeof(T), but this actually changes VL! +// vlenb is not exposed through intrinsics and vreadvl is not VLMAX. +#define HWY_RVV_LANES(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \ + MLEN, NAME, OP) \ + template \ + HWY_API size_t NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d) { \ + size_t actual = v##OP##SEW##LMUL(); \ + /* Common case of full vectors: avoid any extra instructions. */ \ + /* actual includes LMUL, so do not shift again. */ \ + if (detail::IsFull(d)) return actual; \ + /* Check for virtual LMUL, e.g. "uint16mf8_t" (not provided by */ \ + /* intrinsics). In this case the actual LMUL is 1/4, so divide by */ \ + /* another factor of two. */ \ + if (detail::ScaleByPower(128 / SEW, SHIFT) == 1) actual >>= 1; \ + return HWY_MIN(actual, N); \ + } + +HWY_RVV_FOREACH(HWY_RVV_LANES, Lanes, setvlmax_e, _ALL_VIRT) +#undef HWY_RVV_LANES + +template +HWY_API size_t Lanes(Simd /* tag*/) { + return Lanes(Simd()); +} + +// ------------------------------ Common x-macros + +// Last argument to most intrinsics. Use when the op has no d arg of its own, +// which means there is no user-specified cap. +#define HWY_RVV_AVL(SEW, SHIFT) \ + Lanes(ScalableTag()) + +// vector = f(vector), e.g. Not +#define HWY_RVV_RETV_ARGV(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \ + SHIFT, MLEN, NAME, OP) \ + HWY_API HWY_RVV_V(BASE, SEW, LMUL) NAME(HWY_RVV_V(BASE, SEW, LMUL) v) { \ + return v##OP##_v_##CHAR##SEW##LMUL(v, HWY_RVV_AVL(SEW, SHIFT)); \ + } + +// vector = f(vector, scalar), e.g. detail::AddS +#define HWY_RVV_RETV_ARGVS(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \ + SHIFT, MLEN, NAME, OP) \ + HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ + NAME(HWY_RVV_V(BASE, SEW, LMUL) a, HWY_RVV_T(BASE, SEW) b) { \ + return v##OP##_##CHAR##SEW##LMUL(a, b, HWY_RVV_AVL(SEW, SHIFT)); \ + } + +// vector = f(vector, vector), e.g. 
Add +#define HWY_RVV_RETV_ARGVV(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \ + SHIFT, MLEN, NAME, OP) \ + HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ + NAME(HWY_RVV_V(BASE, SEW, LMUL) a, HWY_RVV_V(BASE, SEW, LMUL) b) { \ + return v##OP##_vv_##CHAR##SEW##LMUL(a, b, HWY_RVV_AVL(SEW, SHIFT)); \ + } + +// mask = f(mask) +#define HWY_RVV_RETM_ARGM(SEW, SHIFT, MLEN, NAME, OP) \ + HWY_API HWY_RVV_M(MLEN) NAME(HWY_RVV_M(MLEN) m) { \ + return vm##OP##_m_b##MLEN(m, ~0ull); \ + } + +// ================================================== INIT + +// ------------------------------ Set + +#define HWY_RVV_SET(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \ + MLEN, NAME, OP) \ + template \ + HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ + NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d, HWY_RVV_T(BASE, SEW) arg) { \ + return v##OP##_##CHAR##SEW##LMUL(arg, Lanes(d)); \ + } + +HWY_RVV_FOREACH_UI(HWY_RVV_SET, Set, mv_v_x, _ALL_VIRT) +HWY_RVV_FOREACH_F(HWY_RVV_SET, Set, fmv_v_f, _ALL_VIRT) +#undef HWY_RVV_SET + +// Treat bfloat16_t as uint16_t (using the previously defined Set overloads); +// required for Zero and VFromD. +template +decltype(Set(Simd(), 0)) Set(Simd d, + bfloat16_t arg) { + return Set(RebindToUnsigned(), arg.bits); +} + +template +using VFromD = decltype(Set(D(), TFromD())); + +// ------------------------------ Zero + +template +HWY_API VFromD> Zero(Simd d) { + // Cast to support bfloat16_t. + const RebindToUnsigned du; + return BitCast(d, Set(du, 0)); +} + +// ------------------------------ Undefined + +// RVV vundefined is 'poisoned' such that even XORing a _variable_ initialized +// by it gives unpredictable results. It should only be used for maskoff, so +// keep it internal. For the Highway op, just use Zero (single instruction). +namespace detail { +#define HWY_RVV_UNDEFINED(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \ + SHIFT, MLEN, NAME, OP) \ + template \ + HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ + NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) /* tag */) { \ + return v##OP##_##CHAR##SEW##LMUL(); /* no AVL */ \ + } + +HWY_RVV_FOREACH(HWY_RVV_UNDEFINED, Undefined, undefined, _ALL) +#undef HWY_RVV_UNDEFINED +} // namespace detail + +template +HWY_API VFromD Undefined(D d) { + return Zero(d); +} + +// ------------------------------ BitCast + +namespace detail { + +// Halves LMUL. (Use LMUL arg for the source so we can use _TRUNC.) +#define HWY_RVV_TRUNC(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \ + MLEN, NAME, OP) \ + HWY_API HWY_RVV_V(BASE, SEW, LMULH) NAME(HWY_RVV_V(BASE, SEW, LMUL) v) { \ + return v##OP##_v_##CHAR##SEW##LMUL##_##CHAR##SEW##LMULH(v); /* no AVL */ \ + } +HWY_RVV_FOREACH(HWY_RVV_TRUNC, Trunc, lmul_trunc, _TRUNC) +#undef HWY_RVV_TRUNC + +// Doubles LMUL to `d2` (the arg is only necessary for _VIRT). +#define HWY_RVV_EXT(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \ + MLEN, NAME, OP) \ + template \ + HWY_API HWY_RVV_V(BASE, SEW, LMULD) \ + NAME(HWY_RVV_D(BASE, SEW, N, SHIFT + 1) /* d2 */, \ + HWY_RVV_V(BASE, SEW, LMUL) v) { \ + return v##OP##_v_##CHAR##SEW##LMUL##_##CHAR##SEW##LMULD(v); /* no AVL */ \ + } +HWY_RVV_FOREACH(HWY_RVV_EXT, Ext, lmul_ext, _EXT) +#undef HWY_RVV_EXT + +// For virtual LMUL e.g. 'uint32mf4_t', the return type should be mf2, which is +// the same as the actual input type. 
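+// (Hence the _VIRT Ext below is an identity: "doubling" a virtual mf4
+// vector simply returns the mf2 register it already occupies.)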
+#define HWY_RVV_EXT_VIRT(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \ + SHIFT, MLEN, NAME, OP) \ + template \ + HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ + NAME(HWY_RVV_D(BASE, SEW, N, SHIFT + 1) /* d2 */, \ + HWY_RVV_V(BASE, SEW, LMUL) v) { \ + return v; \ + } +HWY_RVV_FOREACH(HWY_RVV_EXT_VIRT, Ext, lmul_ext, _VIRT) +#undef HWY_RVV_EXT_VIRT + +// For BitCastToByte, the D arg is only to prevent duplicate definitions caused +// by _ALL_VIRT. + +// There is no reinterpret from u8 <-> u8, so just return. +#define HWY_RVV_CAST_U8(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \ + SHIFT, MLEN, NAME, OP) \ + template \ + HWY_API vuint8##LMUL##_t BitCastToByte(Simd /* d */, \ + vuint8##LMUL##_t v) { \ + return v; \ + } \ + template \ + HWY_API vuint8##LMUL##_t BitCastFromByte( \ + HWY_RVV_D(BASE, SEW, N, SHIFT) /* d */, vuint8##LMUL##_t v) { \ + return v; \ + } + +// For i8, need a single reinterpret (HWY_RVV_CAST_IF does two). +#define HWY_RVV_CAST_I8(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \ + SHIFT, MLEN, NAME, OP) \ + template \ + HWY_API vuint8##LMUL##_t BitCastToByte(Simd /* d */, \ + vint8##LMUL##_t v) { \ + return vreinterpret_v_i8##LMUL##_u8##LMUL(v); \ + } \ + template \ + HWY_API vint8##LMUL##_t BitCastFromByte( \ + HWY_RVV_D(BASE, SEW, N, SHIFT) /* d */, vuint8##LMUL##_t v) { \ + return vreinterpret_v_u8##LMUL##_i8##LMUL(v); \ + } + +// Separate u/i because clang only provides signed <-> unsigned reinterpret for +// the same SEW. +#define HWY_RVV_CAST_U(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \ + MLEN, NAME, OP) \ + template \ + HWY_API vuint8##LMUL##_t BitCastToByte(Simd /* d */, \ + HWY_RVV_V(BASE, SEW, LMUL) v) { \ + return v##OP##_v_##CHAR##SEW##LMUL##_u8##LMUL(v); \ + } \ + template \ + HWY_API HWY_RVV_V(BASE, SEW, LMUL) BitCastFromByte( \ + HWY_RVV_D(BASE, SEW, N, SHIFT) /* d */, vuint8##LMUL##_t v) { \ + return v##OP##_v_u8##LMUL##_##CHAR##SEW##LMUL(v); \ + } + +// Signed/Float: first cast to/from unsigned +#define HWY_RVV_CAST_IF(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \ + SHIFT, MLEN, NAME, OP) \ + template \ + HWY_API vuint8##LMUL##_t BitCastToByte(Simd /* d */, \ + HWY_RVV_V(BASE, SEW, LMUL) v) { \ + return v##OP##_v_u##SEW##LMUL##_u8##LMUL( \ + v##OP##_v_##CHAR##SEW##LMUL##_u##SEW##LMUL(v)); \ + } \ + template \ + HWY_API HWY_RVV_V(BASE, SEW, LMUL) BitCastFromByte( \ + HWY_RVV_D(BASE, SEW, N, SHIFT) /* d */, vuint8##LMUL##_t v) { \ + return v##OP##_v_u##SEW##LMUL##_##CHAR##SEW##LMUL( \ + v##OP##_v_u8##LMUL##_u##SEW##LMUL(v)); \ + } + +// Additional versions for virtual LMUL using LMULH for byte vectors. 
+#define HWY_RVV_CAST_VIRT_U(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \ + SHIFT, MLEN, NAME, OP) \ + template \ + HWY_API vuint8##LMULH##_t BitCastToByte(Simd /* d */, \ + HWY_RVV_V(BASE, SEW, LMUL) v) { \ + return detail::Trunc(v##OP##_v_##CHAR##SEW##LMUL##_u8##LMUL(v)); \ + } \ + template \ + HWY_API HWY_RVV_V(BASE, SEW, LMUL) BitCastFromByte( \ + HWY_RVV_D(BASE, SEW, N, SHIFT) /* d */, vuint8##LMULH##_t v) { \ + HWY_RVV_D(uint, 8, N, SHIFT + 1) d2; \ + const vuint8##LMUL##_t v2 = detail::Ext(d2, v); \ + return v##OP##_v_u8##LMUL##_##CHAR##SEW##LMUL(v2); \ + } + +// Signed/Float: first cast to/from unsigned +#define HWY_RVV_CAST_VIRT_IF(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \ + SHIFT, MLEN, NAME, OP) \ + template \ + HWY_API vuint8##LMULH##_t BitCastToByte(Simd /* d */, \ + HWY_RVV_V(BASE, SEW, LMUL) v) { \ + return detail::Trunc(v##OP##_v_u##SEW##LMUL##_u8##LMUL( \ + v##OP##_v_##CHAR##SEW##LMUL##_u##SEW##LMUL(v))); \ + } \ + template \ + HWY_API HWY_RVV_V(BASE, SEW, LMUL) BitCastFromByte( \ + HWY_RVV_D(BASE, SEW, N, SHIFT) /* d */, vuint8##LMULH##_t v) { \ + HWY_RVV_D(uint, 8, N, SHIFT + 1) d2; \ + const vuint8##LMUL##_t v2 = detail::Ext(d2, v); \ + return v##OP##_v_u##SEW##LMUL##_##CHAR##SEW##LMUL( \ + v##OP##_v_u8##LMUL##_u##SEW##LMUL(v2)); \ + } + +HWY_RVV_FOREACH_U08(HWY_RVV_CAST_U8, _, reinterpret, _ALL) +HWY_RVV_FOREACH_I08(HWY_RVV_CAST_I8, _, reinterpret, _ALL) +HWY_RVV_FOREACH_U163264(HWY_RVV_CAST_U, _, reinterpret, _ALL) +HWY_RVV_FOREACH_I163264(HWY_RVV_CAST_IF, _, reinterpret, _ALL) +HWY_RVV_FOREACH_F(HWY_RVV_CAST_IF, _, reinterpret, _ALL) +HWY_RVV_FOREACH_U163264(HWY_RVV_CAST_VIRT_U, _, reinterpret, _VIRT) +HWY_RVV_FOREACH_I163264(HWY_RVV_CAST_VIRT_IF, _, reinterpret, _VIRT) +HWY_RVV_FOREACH_F(HWY_RVV_CAST_VIRT_IF, _, reinterpret, _VIRT) + +#undef HWY_RVV_CAST_U8 +#undef HWY_RVV_CAST_I8 +#undef HWY_RVV_CAST_U +#undef HWY_RVV_CAST_IF +#undef HWY_RVV_CAST_VIRT_U +#undef HWY_RVV_CAST_VIRT_IF + +template +HWY_INLINE VFromD> BitCastFromByte( + Simd /* d */, VFromD> v) { + return BitCastFromByte(Simd(), v); +} + +} // namespace detail + +template +HWY_API VFromD BitCast(D d, FromV v) { + return detail::BitCastFromByte(d, detail::BitCastToByte(d, v)); +} + +namespace detail { + +template >> +HWY_INLINE VFromD BitCastToUnsigned(V v) { + return BitCast(DU(), v); +} + +} // namespace detail + +// ------------------------------ Iota + +namespace detail { + +#define HWY_RVV_IOTA(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \ + MLEN, NAME, OP) \ + template \ + HWY_API HWY_RVV_V(BASE, SEW, LMUL) NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d) { \ + return v##OP##_##CHAR##SEW##LMUL(Lanes(d)); \ + } + +HWY_RVV_FOREACH_U(HWY_RVV_IOTA, Iota0, id_v, _ALL_VIRT) +#undef HWY_RVV_IOTA + +template > +HWY_INLINE VFromD Iota0(const D /*d*/) { + return BitCastToUnsigned(Iota0(DU())); +} + +} // namespace detail + +// ================================================== LOGICAL + +// ------------------------------ Not + +HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGV, Not, not, _ALL) + +template +HWY_API V Not(const V v) { + using DF = DFromV; + using DU = RebindToUnsigned; + return BitCast(DF(), Not(BitCast(DU(), v))); +} + +// ------------------------------ And + +// Non-vector version (ideally immediate) for use with Iota0 +namespace detail { +HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGVS, AndS, and_vx, _ALL) +} // namespace detail + +HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGVV, And, and, _ALL) + +template +HWY_API V And(const V a, const V b) { + using DF = DFromV; + using DU = RebindToUnsigned; + return 
BitCast(DF(), And(BitCast(DU(), a), BitCast(DU(), b))); +} + +// ------------------------------ Or + +HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGVV, Or, or, _ALL) + +template +HWY_API V Or(const V a, const V b) { + using DF = DFromV; + using DU = RebindToUnsigned; + return BitCast(DF(), Or(BitCast(DU(), a), BitCast(DU(), b))); +} + +// ------------------------------ Xor + +// Non-vector version (ideally immediate) for use with Iota0 +namespace detail { +HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGVS, XorS, xor_vx, _ALL) +} // namespace detail + +HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGVV, Xor, xor, _ALL) + +template +HWY_API V Xor(const V a, const V b) { + using DF = DFromV; + using DU = RebindToUnsigned; + return BitCast(DF(), Xor(BitCast(DU(), a), BitCast(DU(), b))); +} + +// ------------------------------ AndNot + +template +HWY_API V AndNot(const V not_a, const V b) { + return And(Not(not_a), b); +} + +// ------------------------------ Or3 + +template +HWY_API V Or3(V o1, V o2, V o3) { + return Or(o1, Or(o2, o3)); +} + +// ------------------------------ OrAnd + +template +HWY_API V OrAnd(const V o, const V a1, const V a2) { + return Or(o, And(a1, a2)); +} + +// ------------------------------ CopySign + +HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGVV, CopySign, fsgnj, _ALL) + +template +HWY_API V CopySignToAbs(const V abs, const V sign) { + // RVV can also handle abs < 0, so no extra action needed. + return CopySign(abs, sign); +} + +// ================================================== ARITHMETIC + +// ------------------------------ Add + +namespace detail { +HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGVS, AddS, add_vx, _ALL) +HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGVS, AddS, fadd_vf, _ALL) +HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGVS, ReverseSubS, rsub_vx, _ALL) +HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGVS, ReverseSubS, frsub_vf, _ALL) +} // namespace detail + +HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGVV, Add, add, _ALL) +HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGVV, Add, fadd, _ALL) + +// ------------------------------ Sub +HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGVV, Sub, sub, _ALL) +HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGVV, Sub, fsub, _ALL) + +// ------------------------------ SaturatedAdd + +HWY_RVV_FOREACH_U08(HWY_RVV_RETV_ARGVV, SaturatedAdd, saddu, _ALL) +HWY_RVV_FOREACH_U16(HWY_RVV_RETV_ARGVV, SaturatedAdd, saddu, _ALL) + +HWY_RVV_FOREACH_I08(HWY_RVV_RETV_ARGVV, SaturatedAdd, sadd, _ALL) +HWY_RVV_FOREACH_I16(HWY_RVV_RETV_ARGVV, SaturatedAdd, sadd, _ALL) + +// ------------------------------ SaturatedSub + +HWY_RVV_FOREACH_U08(HWY_RVV_RETV_ARGVV, SaturatedSub, ssubu, _ALL) +HWY_RVV_FOREACH_U16(HWY_RVV_RETV_ARGVV, SaturatedSub, ssubu, _ALL) + +HWY_RVV_FOREACH_I08(HWY_RVV_RETV_ARGVV, SaturatedSub, ssub, _ALL) +HWY_RVV_FOREACH_I16(HWY_RVV_RETV_ARGVV, SaturatedSub, ssub, _ALL) + +// ------------------------------ AverageRound + +// TODO(janwas): check vxrm rounding mode +HWY_RVV_FOREACH_U08(HWY_RVV_RETV_ARGVV, AverageRound, aaddu, _ALL) +HWY_RVV_FOREACH_U16(HWY_RVV_RETV_ARGVV, AverageRound, aaddu, _ALL) + +// ------------------------------ ShiftLeft[Same] + +// Intrinsics do not define .vi forms, so use .vx instead. 
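+// (For reference, the x-macro below expands for uint32_t at LMUL=1 roughly
+// to:
+//   template <int kBits>
+//   HWY_API vuint32m1_t ShiftLeft(vuint32m1_t v) {
+//     return vsll_vx_u32m1(v, kBits, avl);
+//   }
+// so the compile-time shift count is passed via the scalar .vx operand.)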
+#define HWY_RVV_SHIFT(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \ + MLEN, NAME, OP) \ + template \ + HWY_API HWY_RVV_V(BASE, SEW, LMUL) NAME(HWY_RVV_V(BASE, SEW, LMUL) v) { \ + return v##OP##_vx_##CHAR##SEW##LMUL(v, kBits, HWY_RVV_AVL(SEW, SHIFT)); \ + } \ + HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ + NAME##Same(HWY_RVV_V(BASE, SEW, LMUL) v, int bits) { \ + return v##OP##_vx_##CHAR##SEW##LMUL(v, static_cast(bits), \ + HWY_RVV_AVL(SEW, SHIFT)); \ + } + +HWY_RVV_FOREACH_UI(HWY_RVV_SHIFT, ShiftLeft, sll, _ALL) + +// ------------------------------ ShiftRight[Same] + +HWY_RVV_FOREACH_U(HWY_RVV_SHIFT, ShiftRight, srl, _ALL) +HWY_RVV_FOREACH_I(HWY_RVV_SHIFT, ShiftRight, sra, _ALL) + +#undef HWY_RVV_SHIFT + +// ------------------------------ SumsOf8 (ShiftRight, Add) +template +HWY_API VFromD>> SumsOf8(const VU8 v) { + const DFromV du8; + const RepartitionToWide du16; + const RepartitionToWide du32; + const RepartitionToWide du64; + using VU16 = VFromD; + + const VU16 vFDB97531 = ShiftRight<8>(BitCast(du16, v)); + const VU16 vECA86420 = detail::AndS(BitCast(du16, v), 0xFF); + const VU16 sFE_DC_BA_98_76_54_32_10 = Add(vFDB97531, vECA86420); + + const VU16 szz_FE_zz_BA_zz_76_zz_32 = + BitCast(du16, ShiftRight<16>(BitCast(du32, sFE_DC_BA_98_76_54_32_10))); + const VU16 sxx_FC_xx_B8_xx_74_xx_30 = + Add(sFE_DC_BA_98_76_54_32_10, szz_FE_zz_BA_zz_76_zz_32); + const VU16 szz_zz_xx_FC_zz_zz_xx_74 = + BitCast(du16, ShiftRight<32>(BitCast(du64, sxx_FC_xx_B8_xx_74_xx_30))); + const VU16 sxx_xx_xx_F8_xx_xx_xx_70 = + Add(sxx_FC_xx_B8_xx_74_xx_30, szz_zz_xx_FC_zz_zz_xx_74); + return detail::AndS(BitCast(du64, sxx_xx_xx_F8_xx_xx_xx_70), 0xFFFFull); +} + +// ------------------------------ RotateRight +template +HWY_API V RotateRight(const V v) { + constexpr size_t kSizeInBits = sizeof(TFromV) * 8; + static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift count"); + if (kBits == 0) return v; + return Or(ShiftRight(v), ShiftLeft(v)); +} + +// ------------------------------ Shl +#define HWY_RVV_SHIFT_VV(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \ + SHIFT, MLEN, NAME, OP) \ + HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ + NAME(HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_V(BASE, SEW, LMUL) bits) { \ + return v##OP##_vv_##CHAR##SEW##LMUL(v, bits, HWY_RVV_AVL(SEW, SHIFT)); \ + } + +HWY_RVV_FOREACH_U(HWY_RVV_SHIFT_VV, Shl, sll, _ALL) + +#define HWY_RVV_SHIFT_II(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \ + SHIFT, MLEN, NAME, OP) \ + HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ + NAME(HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_V(BASE, SEW, LMUL) bits) { \ + return v##OP##_vv_##CHAR##SEW##LMUL(v, detail::BitCastToUnsigned(bits), \ + HWY_RVV_AVL(SEW, SHIFT)); \ + } + +HWY_RVV_FOREACH_I(HWY_RVV_SHIFT_II, Shl, sll, _ALL) + +// ------------------------------ Shr + +HWY_RVV_FOREACH_U(HWY_RVV_SHIFT_VV, Shr, srl, _ALL) +HWY_RVV_FOREACH_I(HWY_RVV_SHIFT_II, Shr, sra, _ALL) + +#undef HWY_RVV_SHIFT_II +#undef HWY_RVV_SHIFT_VV + +// ------------------------------ Min + +HWY_RVV_FOREACH_U(HWY_RVV_RETV_ARGVV, Min, minu, _ALL) +HWY_RVV_FOREACH_I(HWY_RVV_RETV_ARGVV, Min, min, _ALL) +HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGVV, Min, fmin, _ALL) + +// ------------------------------ Max + +namespace detail { + +HWY_RVV_FOREACH_U(HWY_RVV_RETV_ARGVS, MaxS, maxu_vx, _ALL) +HWY_RVV_FOREACH_I(HWY_RVV_RETV_ARGVS, MaxS, max_vx, _ALL) +HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGVS, MaxS, fmax_vf, _ALL) + +} // namespace detail + +HWY_RVV_FOREACH_U(HWY_RVV_RETV_ARGVV, Max, maxu, _ALL) +HWY_RVV_FOREACH_I(HWY_RVV_RETV_ARGVV, Max, max, _ALL) 
+HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGVV, Max, fmax, _ALL) + +// ------------------------------ Mul + +HWY_RVV_FOREACH_UI163264(HWY_RVV_RETV_ARGVV, Mul, mul, _ALL) +HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGVV, Mul, fmul, _ALL) + +// Per-target flag to prevent generic_ops-inl.h from defining i64 operator*. +#ifdef HWY_NATIVE_I64MULLO +#undef HWY_NATIVE_I64MULLO +#else +#define HWY_NATIVE_I64MULLO +#endif + +// ------------------------------ MulHigh + +// Only for internal use (Highway only promises MulHigh for 16-bit inputs). +// Used by MulEven; vwmul does not work for m8. +namespace detail { +HWY_RVV_FOREACH_I32(HWY_RVV_RETV_ARGVV, MulHigh, mulh, _ALL) +HWY_RVV_FOREACH_U32(HWY_RVV_RETV_ARGVV, MulHigh, mulhu, _ALL) +HWY_RVV_FOREACH_U64(HWY_RVV_RETV_ARGVV, MulHigh, mulhu, _ALL) +} // namespace detail + +HWY_RVV_FOREACH_U16(HWY_RVV_RETV_ARGVV, MulHigh, mulhu, _ALL) +HWY_RVV_FOREACH_I16(HWY_RVV_RETV_ARGVV, MulHigh, mulh, _ALL) + +// ------------------------------ MulFixedPoint15 +HWY_RVV_FOREACH_I16(HWY_RVV_RETV_ARGVV, MulFixedPoint15, smul, _ALL) + +// ------------------------------ Div +HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGVV, Div, fdiv, _ALL) + +// ------------------------------ ApproximateReciprocal +HWY_RVV_FOREACH_F32(HWY_RVV_RETV_ARGV, ApproximateReciprocal, frec7, _ALL) + +// ------------------------------ Sqrt +HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGV, Sqrt, fsqrt, _ALL) + +// ------------------------------ ApproximateReciprocalSqrt +HWY_RVV_FOREACH_F32(HWY_RVV_RETV_ARGV, ApproximateReciprocalSqrt, frsqrt7, _ALL) + +// ------------------------------ MulAdd +// Note: op is still named vv, not vvv. +#define HWY_RVV_FMA(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \ + MLEN, NAME, OP) \ + HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ + NAME(HWY_RVV_V(BASE, SEW, LMUL) mul, HWY_RVV_V(BASE, SEW, LMUL) x, \ + HWY_RVV_V(BASE, SEW, LMUL) add) { \ + return v##OP##_vv_##CHAR##SEW##LMUL(add, mul, x, HWY_RVV_AVL(SEW, SHIFT)); \ + } + +HWY_RVV_FOREACH_F(HWY_RVV_FMA, MulAdd, fmacc, _ALL) + +// ------------------------------ NegMulAdd +HWY_RVV_FOREACH_F(HWY_RVV_FMA, NegMulAdd, fnmsac, _ALL) + +// ------------------------------ MulSub +HWY_RVV_FOREACH_F(HWY_RVV_FMA, MulSub, fmsac, _ALL) + +// ------------------------------ NegMulSub +HWY_RVV_FOREACH_F(HWY_RVV_FMA, NegMulSub, fnmacc, _ALL) + +#undef HWY_RVV_FMA + +// ================================================== COMPARE + +// Comparisons set a mask bit to 1 if the condition is true, else 0. The XX in +// vboolXX_t is a power of two divisor for vector bits. SLEN 8 / LMUL 1 = 1/8th +// of all bits; SLEN 8 / LMUL 4 = half of all bits. 
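+// (Worked example: MLEN = SEW / LMUL. SEW 8 / LMUL 1 -> MLEN 8, i.e.
+// vbool8_t, one mask bit per 8 vector bits; SEW 8 / LMUL 4 -> MLEN 2, i.e.
+// vbool2_t, one mask bit per 2 vector bits = half of all bits.)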
+
+// mask = f(vector, vector)
+#define HWY_RVV_RETM_ARGVV(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
+                           SHIFT, MLEN, NAME, OP)                           \
+  HWY_API HWY_RVV_M(MLEN)                                                   \
+      NAME(HWY_RVV_V(BASE, SEW, LMUL) a, HWY_RVV_V(BASE, SEW, LMUL) b) {    \
+    return v##OP##_vv_##CHAR##SEW##LMUL##_b##MLEN(a, b,                     \
+                                                  HWY_RVV_AVL(SEW, SHIFT)); \
+  }
+
+// mask = f(vector, scalar)
+#define HWY_RVV_RETM_ARGVS(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
+                           SHIFT, MLEN, NAME, OP)                           \
+  HWY_API HWY_RVV_M(MLEN)                                                   \
+      NAME(HWY_RVV_V(BASE, SEW, LMUL) a, HWY_RVV_T(BASE, SEW) b) {          \
+    return v##OP##_##CHAR##SEW##LMUL##_b##MLEN(a, b, HWY_RVV_AVL(SEW, SHIFT)); \
+  }
+
+// ------------------------------ Eq
+HWY_RVV_FOREACH_UI(HWY_RVV_RETM_ARGVV, Eq, mseq, _ALL)
+HWY_RVV_FOREACH_F(HWY_RVV_RETM_ARGVV, Eq, mfeq, _ALL)
+
+namespace detail {
+HWY_RVV_FOREACH_UI(HWY_RVV_RETM_ARGVS, EqS, mseq_vx, _ALL)
+HWY_RVV_FOREACH_F(HWY_RVV_RETM_ARGVS, EqS, mfeq_vf, _ALL)
+}  // namespace detail
+
+// ------------------------------ Ne
+HWY_RVV_FOREACH_UI(HWY_RVV_RETM_ARGVV, Ne, msne, _ALL)
+HWY_RVV_FOREACH_F(HWY_RVV_RETM_ARGVV, Ne, mfne, _ALL)
+
+namespace detail {
+HWY_RVV_FOREACH_UI(HWY_RVV_RETM_ARGVS, NeS, msne_vx, _ALL)
+HWY_RVV_FOREACH_F(HWY_RVV_RETM_ARGVS, NeS, mfne_vf, _ALL)
+}  // namespace detail
+
+// ------------------------------ Lt
+HWY_RVV_FOREACH_U(HWY_RVV_RETM_ARGVV, Lt, msltu, _ALL)
+HWY_RVV_FOREACH_I(HWY_RVV_RETM_ARGVV, Lt, mslt, _ALL)
+HWY_RVV_FOREACH_F(HWY_RVV_RETM_ARGVV, Lt, mflt, _ALL)
+
+namespace detail {
+HWY_RVV_FOREACH_I(HWY_RVV_RETM_ARGVS, LtS, mslt_vx, _ALL)
+HWY_RVV_FOREACH_U(HWY_RVV_RETM_ARGVS, LtS, msltu_vx, _ALL)
+HWY_RVV_FOREACH_F(HWY_RVV_RETM_ARGVS, LtS, mflt_vf, _ALL)
+}  // namespace detail
+
+// ------------------------------ Le
+HWY_RVV_FOREACH_F(HWY_RVV_RETM_ARGVV, Le, mfle, _ALL)
+
+#undef HWY_RVV_RETM_ARGVV
+#undef HWY_RVV_RETM_ARGVS
+
+// ------------------------------ Gt/Ge
+
+template <class V>
+HWY_API auto Ge(const V a, const V b) -> decltype(Le(a, b)) {
+  return Le(b, a);
+}
+
+template <class V>
+HWY_API auto Gt(const V a, const V b) -> decltype(Lt(a, b)) {
+  return Lt(b, a);
+}
+
+// ------------------------------ TestBit
+template <class V>
+HWY_API auto TestBit(const V a, const V bit) -> decltype(Eq(a, bit)) {
+  return detail::NeS(And(a, bit), 0);
+}
+
+// ------------------------------ Not
+// NOLINTNEXTLINE
+HWY_RVV_FOREACH_B(HWY_RVV_RETM_ARGM, Not, not )
+
+// ------------------------------ And
+
+// mask = f(mask_a, mask_b) (note arg2,arg1 order!)
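+// (Hence the vm*_mm intrinsics in the macro below receive (b, a).) Example of
+// combining the masks these ops produce (a sketch; x, lo and hi are
+// placeholder vectors of the same lane type):
+//   const auto in_range = And(Lt(lo, x), Lt(x, hi));  // lo < x && x < hi
+//   const auto outside  = Not(in_range);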
+#define HWY_RVV_RETM_ARGMM(SEW, SHIFT, MLEN, NAME, OP) \ + HWY_API HWY_RVV_M(MLEN) NAME(HWY_RVV_M(MLEN) a, HWY_RVV_M(MLEN) b) { \ + return vm##OP##_mm_b##MLEN(b, a, HWY_RVV_AVL(SEW, SHIFT)); \ + } + +HWY_RVV_FOREACH_B(HWY_RVV_RETM_ARGMM, And, and) + +// ------------------------------ AndNot +HWY_RVV_FOREACH_B(HWY_RVV_RETM_ARGMM, AndNot, andn) + +// ------------------------------ Or +HWY_RVV_FOREACH_B(HWY_RVV_RETM_ARGMM, Or, or) + +// ------------------------------ Xor +HWY_RVV_FOREACH_B(HWY_RVV_RETM_ARGMM, Xor, xor) + +// ------------------------------ ExclusiveNeither +HWY_RVV_FOREACH_B(HWY_RVV_RETM_ARGMM, ExclusiveNeither, xnor) + +#undef HWY_RVV_RETM_ARGMM + +// ------------------------------ IfThenElse +#define HWY_RVV_IF_THEN_ELSE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \ + SHIFT, MLEN, NAME, OP) \ + HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ + NAME(HWY_RVV_M(MLEN) m, HWY_RVV_V(BASE, SEW, LMUL) yes, \ + HWY_RVV_V(BASE, SEW, LMUL) no) { \ + return v##OP##_vvm_##CHAR##SEW##LMUL(m, no, yes, HWY_RVV_AVL(SEW, SHIFT)); \ + } + +HWY_RVV_FOREACH(HWY_RVV_IF_THEN_ELSE, IfThenElse, merge, _ALL) + +#undef HWY_RVV_IF_THEN_ELSE + +// ------------------------------ IfThenElseZero +template +HWY_API V IfThenElseZero(const M mask, const V yes) { + return IfThenElse(mask, yes, Zero(DFromV())); +} + +// ------------------------------ IfThenZeroElse + +#define HWY_RVV_IF_THEN_ZERO_ELSE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, \ + LMULH, SHIFT, MLEN, NAME, OP) \ + HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ + NAME(HWY_RVV_M(MLEN) m, HWY_RVV_V(BASE, SEW, LMUL) no) { \ + return v##OP##_##CHAR##SEW##LMUL(m, no, 0, HWY_RVV_AVL(SEW, SHIFT)); \ + } + +HWY_RVV_FOREACH_UI(HWY_RVV_IF_THEN_ZERO_ELSE, IfThenZeroElse, merge_vxm, _ALL) +HWY_RVV_FOREACH_F(HWY_RVV_IF_THEN_ZERO_ELSE, IfThenZeroElse, fmerge_vfm, _ALL) + +#undef HWY_RVV_IF_THEN_ZERO_ELSE + +// ------------------------------ MaskFromVec + +template +HWY_API auto MaskFromVec(const V v) -> decltype(Eq(v, v)) { + return detail::NeS(v, 0); +} + +template +using MFromD = decltype(MaskFromVec(Zero(D()))); + +template +HWY_API MFromD RebindMask(const D /*d*/, const MFrom mask) { + // No need to check lane size/LMUL are the same: if not, casting MFrom to + // MFromD would fail. 
+ return mask; +} + +// ------------------------------ VecFromMask + +namespace detail { +#define HWY_RVV_VEC_FROM_MASK(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \ + SHIFT, MLEN, NAME, OP) \ + HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ + NAME(HWY_RVV_V(BASE, SEW, LMUL) v0, HWY_RVV_M(MLEN) m) { \ + return v##OP##_##CHAR##SEW##LMUL##_m(m, v0, v0, 1, \ + HWY_RVV_AVL(SEW, SHIFT)); \ + } + +HWY_RVV_FOREACH_UI(HWY_RVV_VEC_FROM_MASK, SubS, sub_vx, _ALL) +#undef HWY_RVV_VEC_FROM_MASK +} // namespace detail + +template +HWY_API VFromD VecFromMask(const D d, MFromD mask) { + return detail::SubS(Zero(d), mask); +} + +template +HWY_API VFromD VecFromMask(const D d, MFromD mask) { + return BitCast(d, VecFromMask(RebindToUnsigned(), mask)); +} + +// ------------------------------ IfVecThenElse (MaskFromVec) + +template +HWY_API V IfVecThenElse(const V mask, const V yes, const V no) { + return IfThenElse(MaskFromVec(mask), yes, no); +} + +// ------------------------------ ZeroIfNegative +template +HWY_API V ZeroIfNegative(const V v) { + return IfThenZeroElse(detail::LtS(v, 0), v); +} + +// ------------------------------ BroadcastSignBit +template +HWY_API V BroadcastSignBit(const V v) { + return ShiftRight) * 8 - 1>(v); +} + +// ------------------------------ IfNegativeThenElse (BroadcastSignBit) +template +HWY_API V IfNegativeThenElse(V v, V yes, V no) { + static_assert(IsSigned>(), "Only works for signed/float"); + const DFromV d; + const RebindToSigned di; + + MFromD m = + MaskFromVec(BitCast(d, BroadcastSignBit(BitCast(di, v)))); + return IfThenElse(m, yes, no); +} + +// ------------------------------ FindFirstTrue + +#define HWY_RVV_FIND_FIRST_TRUE(SEW, SHIFT, MLEN, NAME, OP) \ + template \ + HWY_API intptr_t FindFirstTrue(D d, HWY_RVV_M(MLEN) m) { \ + static_assert(MLenFromD(d) == MLEN, "Type mismatch"); \ + return vfirst_m_b##MLEN(m, Lanes(d)); \ + } \ + template \ + HWY_API size_t FindKnownFirstTrue(D d, HWY_RVV_M(MLEN) m) { \ + static_assert(MLenFromD(d) == MLEN, "Type mismatch"); \ + return static_cast(vfirst_m_b##MLEN(m, Lanes(d))); \ + } + +HWY_RVV_FOREACH_B(HWY_RVV_FIND_FIRST_TRUE, , _) +#undef HWY_RVV_FIND_FIRST_TRUE + +// ------------------------------ AllFalse +template +HWY_API bool AllFalse(D d, MFromD m) { + return FindFirstTrue(d, m) < 0; +} + +// ------------------------------ AllTrue + +#define HWY_RVV_ALL_TRUE(SEW, SHIFT, MLEN, NAME, OP) \ + template \ + HWY_API bool AllTrue(D d, HWY_RVV_M(MLEN) m) { \ + static_assert(MLenFromD(d) == MLEN, "Type mismatch"); \ + return AllFalse(d, vmnot_m_b##MLEN(m, Lanes(d))); \ + } + +HWY_RVV_FOREACH_B(HWY_RVV_ALL_TRUE, _, _) +#undef HWY_RVV_ALL_TRUE + +// ------------------------------ CountTrue + +#define HWY_RVV_COUNT_TRUE(SEW, SHIFT, MLEN, NAME, OP) \ + template \ + HWY_API size_t CountTrue(D d, HWY_RVV_M(MLEN) m) { \ + static_assert(MLenFromD(d) == MLEN, "Type mismatch"); \ + return vcpop_m_b##MLEN(m, Lanes(d)); \ + } + +HWY_RVV_FOREACH_B(HWY_RVV_COUNT_TRUE, _, _) +#undef HWY_RVV_COUNT_TRUE + +// ================================================== MEMORY + +// ------------------------------ Load + +#define HWY_RVV_LOAD(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \ + MLEN, NAME, OP) \ + template \ + HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ + NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d, \ + const HWY_RVV_T(BASE, SEW) * HWY_RESTRICT p) { \ + return v##OP##SEW##_v_##CHAR##SEW##LMUL(p, Lanes(d)); \ + } +HWY_RVV_FOREACH(HWY_RVV_LOAD, Load, le, _ALL_VIRT) +#undef HWY_RVV_LOAD + +// There is no native BF16, treat as uint16_t. 
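+// The next two overloads therefore reinterpret bf16 lanes as u16. Round-trip
+// sketch (illustrative; `p` is a placeholder pointer to at least Lanes(d)
+// bfloat16_t values):
+//   const ScalableTag<bfloat16_t> d;
+//   const auto v = Load(d, p);  // lanes hold the raw u16 bit patterns
+//   Store(v, d, p);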
+template +HWY_API VFromD> Load( + Simd d, const bfloat16_t* HWY_RESTRICT p) { + return Load(RebindToUnsigned(), + reinterpret_cast(p)); +} + +template +HWY_API void Store(VFromD> v, + Simd d, bfloat16_t* HWY_RESTRICT p) { + Store(v, RebindToUnsigned(), + reinterpret_cast(p)); +} + +// ------------------------------ LoadU + +// RVV only requires lane alignment, not natural alignment of the entire vector. +template +HWY_API VFromD LoadU(D d, const TFromD* HWY_RESTRICT p) { + return Load(d, p); +} + +// ------------------------------ MaskedLoad + +#define HWY_RVV_MASKED_LOAD(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \ + SHIFT, MLEN, NAME, OP) \ + template \ + HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ + NAME(HWY_RVV_M(MLEN) m, HWY_RVV_D(BASE, SEW, N, SHIFT) d, \ + const HWY_RVV_T(BASE, SEW) * HWY_RESTRICT p) { \ + return v##OP##SEW##_v_##CHAR##SEW##LMUL##_m(m, Zero(d), p, Lanes(d)); \ + } +HWY_RVV_FOREACH(HWY_RVV_MASKED_LOAD, MaskedLoad, le, _ALL_VIRT) +#undef HWY_RVV_MASKED_LOAD + +// ------------------------------ Store + +#define HWY_RVV_STORE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \ + MLEN, NAME, OP) \ + template \ + HWY_API void NAME(HWY_RVV_V(BASE, SEW, LMUL) v, \ + HWY_RVV_D(BASE, SEW, N, SHIFT) d, \ + HWY_RVV_T(BASE, SEW) * HWY_RESTRICT p) { \ + return v##OP##SEW##_v_##CHAR##SEW##LMUL(p, v, Lanes(d)); \ + } +HWY_RVV_FOREACH(HWY_RVV_STORE, Store, se, _ALL_VIRT) +#undef HWY_RVV_STORE + +// ------------------------------ BlendedStore + +#define HWY_RVV_BLENDED_STORE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \ + SHIFT, MLEN, NAME, OP) \ + template \ + HWY_API void NAME(HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_M(MLEN) m, \ + HWY_RVV_D(BASE, SEW, N, SHIFT) d, \ + HWY_RVV_T(BASE, SEW) * HWY_RESTRICT p) { \ + return v##OP##SEW##_v_##CHAR##SEW##LMUL##_m(m, p, v, Lanes(d)); \ + } +HWY_RVV_FOREACH(HWY_RVV_BLENDED_STORE, BlendedStore, se, _ALL_VIRT) +#undef HWY_RVV_BLENDED_STORE + +namespace detail { + +#define HWY_RVV_STOREN(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \ + MLEN, NAME, OP) \ + template \ + HWY_API void NAME(size_t count, HWY_RVV_V(BASE, SEW, LMUL) v, \ + HWY_RVV_D(BASE, SEW, N, SHIFT) /* d */, \ + HWY_RVV_T(BASE, SEW) * HWY_RESTRICT p) { \ + return v##OP##SEW##_v_##CHAR##SEW##LMUL(p, v, count); \ + } +HWY_RVV_FOREACH(HWY_RVV_STOREN, StoreN, se, _ALL_VIRT) +#undef HWY_RVV_STOREN + +} // namespace detail + +// ------------------------------ StoreU + +// RVV only requires lane alignment, not natural alignment of the entire vector. 
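+// Hence StoreU below simply forwards to Store. Caller-side sketch
+// (illustrative; `p` is a placeholder float pointer, not vector-aligned):
+//   const ScalableTag<float> d;
+//   StoreU(Iota(d, 1.0f), d, p + 1);  // 4-byte element alignment suffices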
+template +HWY_API void StoreU(const V v, D d, TFromD* HWY_RESTRICT p) { + Store(v, d, p); +} + +// ------------------------------ Stream +template +HWY_API void Stream(const V v, D d, T* HWY_RESTRICT aligned) { + Store(v, d, aligned); +} + +// ------------------------------ ScatterOffset + +#define HWY_RVV_SCATTER(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \ + SHIFT, MLEN, NAME, OP) \ + template \ + HWY_API void NAME(HWY_RVV_V(BASE, SEW, LMUL) v, \ + HWY_RVV_D(BASE, SEW, N, SHIFT) d, \ + HWY_RVV_T(BASE, SEW) * HWY_RESTRICT base, \ + HWY_RVV_V(int, SEW, LMUL) offset) { \ + return v##OP##ei##SEW##_v_##CHAR##SEW##LMUL( \ + base, detail::BitCastToUnsigned(offset), v, Lanes(d)); \ + } +HWY_RVV_FOREACH(HWY_RVV_SCATTER, ScatterOffset, sux, _ALL_VIRT) +#undef HWY_RVV_SCATTER + +// ------------------------------ ScatterIndex + +template +HWY_API void ScatterIndex(VFromD v, D d, TFromD* HWY_RESTRICT base, + const VFromD> index) { + return ScatterOffset(v, d, base, ShiftLeft<2>(index)); +} + +template +HWY_API void ScatterIndex(VFromD v, D d, TFromD* HWY_RESTRICT base, + const VFromD> index) { + return ScatterOffset(v, d, base, ShiftLeft<3>(index)); +} + +// ------------------------------ GatherOffset + +#define HWY_RVV_GATHER(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \ + MLEN, NAME, OP) \ + template \ + HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ + NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d, \ + const HWY_RVV_T(BASE, SEW) * HWY_RESTRICT base, \ + HWY_RVV_V(int, SEW, LMUL) offset) { \ + return v##OP##ei##SEW##_v_##CHAR##SEW##LMUL( \ + base, detail::BitCastToUnsigned(offset), Lanes(d)); \ + } +HWY_RVV_FOREACH(HWY_RVV_GATHER, GatherOffset, lux, _ALL_VIRT) +#undef HWY_RVV_GATHER + +// ------------------------------ GatherIndex + +template +HWY_API VFromD GatherIndex(D d, const TFromD* HWY_RESTRICT base, + const VFromD> index) { + return GatherOffset(d, base, ShiftLeft<2>(index)); +} + +template +HWY_API VFromD GatherIndex(D d, const TFromD* HWY_RESTRICT base, + const VFromD> index) { + return GatherOffset(d, base, ShiftLeft<3>(index)); +} + +// ------------------------------ LoadInterleaved2 + +// Per-target flag to prevent generic_ops-inl.h from defining LoadInterleaved2. +#ifdef HWY_NATIVE_LOAD_STORE_INTERLEAVED +#undef HWY_NATIVE_LOAD_STORE_INTERLEAVED +#else +#define HWY_NATIVE_LOAD_STORE_INTERLEAVED +#endif + +#define HWY_RVV_LOAD2(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \ + MLEN, NAME, OP) \ + template \ + HWY_API void NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d, \ + const HWY_RVV_T(BASE, SEW) * HWY_RESTRICT unaligned, \ + HWY_RVV_V(BASE, SEW, LMUL) & v0, \ + HWY_RVV_V(BASE, SEW, LMUL) & v1) { \ + v##OP##e##SEW##_v_##CHAR##SEW##LMUL(&v0, &v1, unaligned, Lanes(d)); \ + } +// Segments are limited to 8 registers, so we can only go up to LMUL=2. +HWY_RVV_FOREACH(HWY_RVV_LOAD2, LoadInterleaved2, lseg2, _LE2_VIRT) +#undef HWY_RVV_LOAD2 + +// ------------------------------ LoadInterleaved3 + +#define HWY_RVV_LOAD3(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \ + MLEN, NAME, OP) \ + template \ + HWY_API void NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d, \ + const HWY_RVV_T(BASE, SEW) * HWY_RESTRICT unaligned, \ + HWY_RVV_V(BASE, SEW, LMUL) & v0, \ + HWY_RVV_V(BASE, SEW, LMUL) & v1, \ + HWY_RVV_V(BASE, SEW, LMUL) & v2) { \ + v##OP##e##SEW##_v_##CHAR##SEW##LMUL(&v0, &v1, &v2, unaligned, Lanes(d)); \ + } +// Segments are limited to 8 registers, so we can only go up to LMUL=2. 
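+// (The LMUL <= 2 cap also applies to LoadInterleaved3, instantiated next.)
+// Usage sketch, e.g. deinterleaving packed RGB bytes (illustrative; `rgb` is
+// a placeholder pointer to 3 * Lanes(d) values):
+//   const ScalableTag<uint8_t> d;
+//   VFromD<decltype(d)> r, g, b;
+//   LoadInterleaved3(d, rgb, r, g, b);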
+HWY_RVV_FOREACH(HWY_RVV_LOAD3, LoadInterleaved3, lseg3, _LE2_VIRT) +#undef HWY_RVV_LOAD3 + +// ------------------------------ LoadInterleaved4 + +#define HWY_RVV_LOAD4(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \ + MLEN, NAME, OP) \ + template \ + HWY_API void NAME( \ + HWY_RVV_D(BASE, SEW, N, SHIFT) d, \ + const HWY_RVV_T(BASE, SEW) * HWY_RESTRICT aligned, \ + HWY_RVV_V(BASE, SEW, LMUL) & v0, HWY_RVV_V(BASE, SEW, LMUL) & v1, \ + HWY_RVV_V(BASE, SEW, LMUL) & v2, HWY_RVV_V(BASE, SEW, LMUL) & v3) { \ + v##OP##e##SEW##_v_##CHAR##SEW##LMUL(&v0, &v1, &v2, &v3, aligned, \ + Lanes(d)); \ + } +// Segments are limited to 8 registers, so we can only go up to LMUL=2. +HWY_RVV_FOREACH(HWY_RVV_LOAD4, LoadInterleaved4, lseg4, _LE2_VIRT) +#undef HWY_RVV_LOAD4 + +// ------------------------------ StoreInterleaved2 + +#define HWY_RVV_STORE2(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \ + MLEN, NAME, OP) \ + template \ + HWY_API void NAME(HWY_RVV_V(BASE, SEW, LMUL) v0, \ + HWY_RVV_V(BASE, SEW, LMUL) v1, \ + HWY_RVV_D(BASE, SEW, N, SHIFT) d, \ + HWY_RVV_T(BASE, SEW) * HWY_RESTRICT unaligned) { \ + v##OP##e##SEW##_v_##CHAR##SEW##LMUL(unaligned, v0, v1, Lanes(d)); \ + } +// Segments are limited to 8 registers, so we can only go up to LMUL=2. +HWY_RVV_FOREACH(HWY_RVV_STORE2, StoreInterleaved2, sseg2, _LE2_VIRT) +#undef HWY_RVV_STORE2 + +// ------------------------------ StoreInterleaved3 + +#define HWY_RVV_STORE3(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \ + MLEN, NAME, OP) \ + template \ + HWY_API void NAME( \ + HWY_RVV_V(BASE, SEW, LMUL) v0, HWY_RVV_V(BASE, SEW, LMUL) v1, \ + HWY_RVV_V(BASE, SEW, LMUL) v2, HWY_RVV_D(BASE, SEW, N, SHIFT) d, \ + HWY_RVV_T(BASE, SEW) * HWY_RESTRICT unaligned) { \ + v##OP##e##SEW##_v_##CHAR##SEW##LMUL(unaligned, v0, v1, v2, Lanes(d)); \ + } +// Segments are limited to 8 registers, so we can only go up to LMUL=2. +HWY_RVV_FOREACH(HWY_RVV_STORE3, StoreInterleaved3, sseg3, _LE2_VIRT) +#undef HWY_RVV_STORE3 + +// ------------------------------ StoreInterleaved4 + +#define HWY_RVV_STORE4(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \ + MLEN, NAME, OP) \ + template \ + HWY_API void NAME( \ + HWY_RVV_V(BASE, SEW, LMUL) v0, HWY_RVV_V(BASE, SEW, LMUL) v1, \ + HWY_RVV_V(BASE, SEW, LMUL) v2, HWY_RVV_V(BASE, SEW, LMUL) v3, \ + HWY_RVV_D(BASE, SEW, N, SHIFT) d, \ + HWY_RVV_T(BASE, SEW) * HWY_RESTRICT aligned) { \ + v##OP##e##SEW##_v_##CHAR##SEW##LMUL(aligned, v0, v1, v2, v3, Lanes(d)); \ + } +// Segments are limited to 8 registers, so we can only go up to LMUL=2. +HWY_RVV_FOREACH(HWY_RVV_STORE4, StoreInterleaved4, sseg4, _LE2_VIRT) +#undef HWY_RVV_STORE4 + +// ================================================== CONVERT + +// ------------------------------ PromoteTo + +// SEW is for the input so we can use F16 (no-op if not supported). 
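+// Usage sketch for the PromoteTo overloads defined below (illustrative;
+// `bytes` is a placeholder pointer to Lanes(d16) uint8_t values):
+//   const ScalableTag<uint16_t> d16;
+//   const Rebind<uint8_t, decltype(d16)> d8;
+//   const auto wide = PromoteTo(d16, Load(d8, bytes));  // zero-extend u8->u16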
+#define HWY_RVV_PROMOTE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \ + SHIFT, MLEN, NAME, OP) \ + template \ + HWY_API HWY_RVV_V(BASE, SEWD, LMULD) NAME( \ + HWY_RVV_D(BASE, SEWD, N, SHIFT + 1) d, HWY_RVV_V(BASE, SEW, LMUL) v) { \ + return OP##CHAR##SEWD##LMULD(v, Lanes(d)); \ + } + +HWY_RVV_FOREACH_U08(HWY_RVV_PROMOTE, PromoteTo, vzext_vf2_, _EXT_VIRT) +HWY_RVV_FOREACH_U16(HWY_RVV_PROMOTE, PromoteTo, vzext_vf2_, _EXT_VIRT) +HWY_RVV_FOREACH_U32(HWY_RVV_PROMOTE, PromoteTo, vzext_vf2_, _EXT_VIRT) +HWY_RVV_FOREACH_I08(HWY_RVV_PROMOTE, PromoteTo, vsext_vf2_, _EXT_VIRT) +HWY_RVV_FOREACH_I16(HWY_RVV_PROMOTE, PromoteTo, vsext_vf2_, _EXT_VIRT) +HWY_RVV_FOREACH_I32(HWY_RVV_PROMOTE, PromoteTo, vsext_vf2_, _EXT_VIRT) +HWY_RVV_FOREACH_F16(HWY_RVV_PROMOTE, PromoteTo, vfwcvt_f_f_v_, _EXT_VIRT) +HWY_RVV_FOREACH_F32(HWY_RVV_PROMOTE, PromoteTo, vfwcvt_f_f_v_, _EXT_VIRT) +#undef HWY_RVV_PROMOTE + +// The above X-macro cannot handle 4x promotion nor type switching. +// TODO(janwas): use BASE2 arg to allow the latter. +#define HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, LMUL, LMUL_IN, \ + SHIFT, ADD) \ + template \ + HWY_API HWY_RVV_V(BASE, BITS, LMUL) \ + PromoteTo(HWY_RVV_D(BASE, BITS, N, SHIFT + ADD) d, \ + HWY_RVV_V(BASE_IN, BITS_IN, LMUL_IN) v) { \ + return OP##CHAR##BITS##LMUL(v, Lanes(d)); \ + } + +#define HWY_RVV_PROMOTE_X2(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN) \ + HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m1, mf2, -2, 1) \ + HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m1, mf2, -1, 1) \ + HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m2, m1, 0, 1) \ + HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m4, m2, 1, 1) \ + HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m8, m4, 2, 1) + +#define HWY_RVV_PROMOTE_X4(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN) \ + HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, mf2, mf8, -3, 2) \ + HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m1, mf4, -2, 2) \ + HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m2, mf2, -1, 2) \ + HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m4, m1, 0, 2) \ + HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m8, m2, 1, 2) + +HWY_RVV_PROMOTE_X4(vzext_vf4_, uint, u, 32, uint, 8) +HWY_RVV_PROMOTE_X4(vsext_vf4_, int, i, 32, int, 8) + +// i32 to f64 +HWY_RVV_PROMOTE_X2(vfwcvt_f_x_v_, float, f, 64, int, 32) + +#undef HWY_RVV_PROMOTE_X4 +#undef HWY_RVV_PROMOTE_X2 +#undef HWY_RVV_PROMOTE + +// Unsigned to signed: cast for unsigned promote. +template +HWY_API auto PromoteTo(Simd d, + VFromD> v) + -> VFromD { + return BitCast(d, PromoteTo(RebindToUnsigned(), v)); +} + +template +HWY_API auto PromoteTo(Simd d, + VFromD> v) + -> VFromD { + return BitCast(d, PromoteTo(RebindToUnsigned(), v)); +} + +template +HWY_API auto PromoteTo(Simd d, + VFromD> v) + -> VFromD { + return BitCast(d, PromoteTo(RebindToUnsigned(), v)); +} + +template +HWY_API auto PromoteTo(Simd d, + VFromD> v) + -> VFromD { + const RebindToSigned di32; + const Rebind du16; + return BitCast(d, ShiftLeft<16>(PromoteTo(di32, BitCast(du16, v)))); +} + +// ------------------------------ DemoteTo U + +// SEW is for the source so we can use _DEMOTE. 
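+// Usage sketch (illustrative; `src` is a placeholder pointer): saturating
+// narrowing i16 -> u8, where negative inputs clamp to zero as with x86 packus:
+//   const ScalableTag<int16_t> d16;
+//   const Rebind<uint8_t, decltype(d16)> d8;
+//   const auto narrow = DemoteTo(d8, Load(d16, src));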
+#define HWY_RVV_DEMOTE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \ + MLEN, NAME, OP) \ + template \ + HWY_API HWY_RVV_V(BASE, SEWH, LMULH) NAME( \ + HWY_RVV_D(BASE, SEWH, N, SHIFT - 1) d, HWY_RVV_V(BASE, SEW, LMUL) v) { \ + return OP##CHAR##SEWH##LMULH(v, 0, Lanes(d)); \ + } \ + template \ + HWY_API HWY_RVV_V(BASE, SEWH, LMULH) NAME##Shr16( \ + HWY_RVV_D(BASE, SEWH, N, SHIFT - 1) d, HWY_RVV_V(BASE, SEW, LMUL) v) { \ + return OP##CHAR##SEWH##LMULH(v, 16, Lanes(d)); \ + } + +// Unsigned -> unsigned (also used for bf16) +namespace detail { +HWY_RVV_FOREACH_U16(HWY_RVV_DEMOTE, DemoteTo, vnclipu_wx_, _DEMOTE_VIRT) +HWY_RVV_FOREACH_U32(HWY_RVV_DEMOTE, DemoteTo, vnclipu_wx_, _DEMOTE_VIRT) +} // namespace detail + +// SEW is for the source so we can use _DEMOTE. +#define HWY_RVV_DEMOTE_I_TO_U(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \ + SHIFT, MLEN, NAME, OP) \ + template \ + HWY_API HWY_RVV_V(uint, SEWH, LMULH) NAME( \ + HWY_RVV_D(uint, SEWH, N, SHIFT - 1) d, HWY_RVV_V(int, SEW, LMUL) v) { \ + /* First clamp negative numbers to zero to match x86 packus. */ \ + return detail::DemoteTo(d, detail::BitCastToUnsigned(detail::MaxS(v, 0))); \ + } +HWY_RVV_FOREACH_I32(HWY_RVV_DEMOTE_I_TO_U, DemoteTo, _, _DEMOTE_VIRT) +HWY_RVV_FOREACH_I16(HWY_RVV_DEMOTE_I_TO_U, DemoteTo, _, _DEMOTE_VIRT) +#undef HWY_RVV_DEMOTE_I_TO_U + +template +HWY_API vuint8mf8_t DemoteTo(Simd d, const vint32mf2_t v) { + return vnclipu_wx_u8mf8(DemoteTo(Simd(), v), 0, Lanes(d)); +} +template +HWY_API vuint8mf4_t DemoteTo(Simd d, const vint32m1_t v) { + return vnclipu_wx_u8mf4(DemoteTo(Simd(), v), 0, Lanes(d)); +} +template +HWY_API vuint8mf2_t DemoteTo(Simd d, const vint32m2_t v) { + return vnclipu_wx_u8mf2(DemoteTo(Simd(), v), 0, Lanes(d)); +} +template +HWY_API vuint8m1_t DemoteTo(Simd d, const vint32m4_t v) { + return vnclipu_wx_u8m1(DemoteTo(Simd(), v), 0, Lanes(d)); +} +template +HWY_API vuint8m2_t DemoteTo(Simd d, const vint32m8_t v) { + return vnclipu_wx_u8m2(DemoteTo(Simd(), v), 0, Lanes(d)); +} + +HWY_API vuint8mf8_t U8FromU32(const vuint32mf2_t v) { + const size_t avl = Lanes(ScalableTag()); + return vnclipu_wx_u8mf8(vnclipu_wx_u16mf4(v, 0, avl), 0, avl); +} +HWY_API vuint8mf4_t U8FromU32(const vuint32m1_t v) { + const size_t avl = Lanes(ScalableTag()); + return vnclipu_wx_u8mf4(vnclipu_wx_u16mf2(v, 0, avl), 0, avl); +} +HWY_API vuint8mf2_t U8FromU32(const vuint32m2_t v) { + const size_t avl = Lanes(ScalableTag()); + return vnclipu_wx_u8mf2(vnclipu_wx_u16m1(v, 0, avl), 0, avl); +} +HWY_API vuint8m1_t U8FromU32(const vuint32m4_t v) { + const size_t avl = Lanes(ScalableTag()); + return vnclipu_wx_u8m1(vnclipu_wx_u16m2(v, 0, avl), 0, avl); +} +HWY_API vuint8m2_t U8FromU32(const vuint32m8_t v) { + const size_t avl = Lanes(ScalableTag()); + return vnclipu_wx_u8m2(vnclipu_wx_u16m4(v, 0, avl), 0, avl); +} + +// ------------------------------ Truncations + +template +HWY_API vuint8mf8_t TruncateTo(Simd d, + const VFromD> v) { + const size_t avl = Lanes(d); + const vuint64m1_t v1 = vand(v, 0xFF, avl); + const vuint32mf2_t v2 = vnclipu_wx_u32mf2(v1, 0, avl); + const vuint16mf4_t v3 = vnclipu_wx_u16mf4(v2, 0, avl); + return vnclipu_wx_u8mf8(v3, 0, avl); +} + +template +HWY_API vuint8mf4_t TruncateTo(Simd d, + const VFromD> v) { + const size_t avl = Lanes(d); + const vuint64m2_t v1 = vand(v, 0xFF, avl); + const vuint32m1_t v2 = vnclipu_wx_u32m1(v1, 0, avl); + const vuint16mf2_t v3 = vnclipu_wx_u16mf2(v2, 0, avl); + return vnclipu_wx_u8mf4(v3, 0, avl); +} + +template +HWY_API vuint8mf2_t TruncateTo(Simd d, + const 
VFromD> v) { + const size_t avl = Lanes(d); + const vuint64m4_t v1 = vand(v, 0xFF, avl); + const vuint32m2_t v2 = vnclipu_wx_u32m2(v1, 0, avl); + const vuint16m1_t v3 = vnclipu_wx_u16m1(v2, 0, avl); + return vnclipu_wx_u8mf2(v3, 0, avl); +} + +template +HWY_API vuint8m1_t TruncateTo(Simd d, + const VFromD> v) { + const size_t avl = Lanes(d); + const vuint64m8_t v1 = vand(v, 0xFF, avl); + const vuint32m4_t v2 = vnclipu_wx_u32m4(v1, 0, avl); + const vuint16m2_t v3 = vnclipu_wx_u16m2(v2, 0, avl); + return vnclipu_wx_u8m1(v3, 0, avl); +} + +template +HWY_API vuint16mf4_t TruncateTo(Simd d, + const VFromD> v) { + const size_t avl = Lanes(d); + const vuint64m1_t v1 = vand(v, 0xFFFF, avl); + const vuint32mf2_t v2 = vnclipu_wx_u32mf2(v1, 0, avl); + return vnclipu_wx_u16mf4(v2, 0, avl); +} + +template +HWY_API vuint16mf2_t TruncateTo(Simd d, + const VFromD> v) { + const size_t avl = Lanes(d); + const vuint64m2_t v1 = vand(v, 0xFFFF, avl); + const vuint32m1_t v2 = vnclipu_wx_u32m1(v1, 0, avl); + return vnclipu_wx_u16mf2(v2, 0, avl); +} + +template +HWY_API vuint16m1_t TruncateTo(Simd d, + const VFromD> v) { + const size_t avl = Lanes(d); + const vuint64m4_t v1 = vand(v, 0xFFFF, avl); + const vuint32m2_t v2 = vnclipu_wx_u32m2(v1, 0, avl); + return vnclipu_wx_u16m1(v2, 0, avl); +} + +template +HWY_API vuint16m2_t TruncateTo(Simd d, + const VFromD> v) { + const size_t avl = Lanes(d); + const vuint64m8_t v1 = vand(v, 0xFFFF, avl); + const vuint32m4_t v2 = vnclipu_wx_u32m4(v1, 0, avl); + return vnclipu_wx_u16m2(v2, 0, avl); +} + +template +HWY_API vuint32mf2_t TruncateTo(Simd d, + const VFromD> v) { + const size_t avl = Lanes(d); + const vuint64m1_t v1 = vand(v, 0xFFFFFFFFu, avl); + return vnclipu_wx_u32mf2(v1, 0, avl); +} + +template +HWY_API vuint32m1_t TruncateTo(Simd d, + const VFromD> v) { + const size_t avl = Lanes(d); + const vuint64m2_t v1 = vand(v, 0xFFFFFFFFu, avl); + return vnclipu_wx_u32m1(v1, 0, avl); +} + +template +HWY_API vuint32m2_t TruncateTo(Simd d, + const VFromD> v) { + const size_t avl = Lanes(d); + const vuint64m4_t v1 = vand(v, 0xFFFFFFFFu, avl); + return vnclipu_wx_u32m2(v1, 0, avl); +} + +template +HWY_API vuint32m4_t TruncateTo(Simd d, + const VFromD> v) { + const size_t avl = Lanes(d); + const vuint64m8_t v1 = vand(v, 0xFFFFFFFFu, avl); + return vnclipu_wx_u32m4(v1, 0, avl); +} + +template +HWY_API vuint8mf8_t TruncateTo(Simd d, + const VFromD> v) { + const size_t avl = Lanes(d); + const vuint32mf2_t v1 = vand(v, 0xFF, avl); + const vuint16mf4_t v2 = vnclipu_wx_u16mf4(v1, 0, avl); + return vnclipu_wx_u8mf8(v2, 0, avl); +} + +template +HWY_API vuint8mf4_t TruncateTo(Simd d, + const VFromD> v) { + const size_t avl = Lanes(d); + const vuint32m1_t v1 = vand(v, 0xFF, avl); + const vuint16mf2_t v2 = vnclipu_wx_u16mf2(v1, 0, avl); + return vnclipu_wx_u8mf4(v2, 0, avl); +} + +template +HWY_API vuint8mf2_t TruncateTo(Simd d, + const VFromD> v) { + const size_t avl = Lanes(d); + const vuint32m2_t v1 = vand(v, 0xFF, avl); + const vuint16m1_t v2 = vnclipu_wx_u16m1(v1, 0, avl); + return vnclipu_wx_u8mf2(v2, 0, avl); +} + +template +HWY_API vuint8m1_t TruncateTo(Simd d, + const VFromD> v) { + const size_t avl = Lanes(d); + const vuint32m4_t v1 = vand(v, 0xFF, avl); + const vuint16m2_t v2 = vnclipu_wx_u16m2(v1, 0, avl); + return vnclipu_wx_u8m1(v2, 0, avl); +} + +template +HWY_API vuint8m2_t TruncateTo(Simd d, + const VFromD> v) { + const size_t avl = Lanes(d); + const vuint32m8_t v1 = vand(v, 0xFF, avl); + const vuint16m4_t v2 = vnclipu_wx_u16m4(v1, 0, avl); + return vnclipu_wx_u8m2(v2, 0, 
avl); +} + +template +HWY_API vuint16mf4_t TruncateTo(Simd d, + const VFromD> v) { + const size_t avl = Lanes(d); + const vuint32mf2_t v1 = vand(v, 0xFFFF, avl); + return vnclipu_wx_u16mf4(v1, 0, avl); +} + +template +HWY_API vuint16mf2_t TruncateTo(Simd d, + const VFromD> v) { + const size_t avl = Lanes(d); + const vuint32m1_t v1 = vand(v, 0xFFFF, avl); + return vnclipu_wx_u16mf2(v1, 0, avl); +} + +template +HWY_API vuint16m1_t TruncateTo(Simd d, + const VFromD> v) { + const size_t avl = Lanes(d); + const vuint32m2_t v1 = vand(v, 0xFFFF, avl); + return vnclipu_wx_u16m1(v1, 0, avl); +} + +template +HWY_API vuint16m2_t TruncateTo(Simd d, + const VFromD> v) { + const size_t avl = Lanes(d); + const vuint32m4_t v1 = vand(v, 0xFFFF, avl); + return vnclipu_wx_u16m2(v1, 0, avl); +} + +template +HWY_API vuint16m4_t TruncateTo(Simd d, + const VFromD> v) { + const size_t avl = Lanes(d); + const vuint32m8_t v1 = vand(v, 0xFFFF, avl); + return vnclipu_wx_u16m4(v1, 0, avl); +} + +template +HWY_API vuint8mf8_t TruncateTo(Simd d, + const VFromD> v) { + const size_t avl = Lanes(d); + const vuint16mf4_t v1 = vand(v, 0xFF, avl); + return vnclipu_wx_u8mf8(v1, 0, avl); +} + +template +HWY_API vuint8mf4_t TruncateTo(Simd d, + const VFromD> v) { + const size_t avl = Lanes(d); + const vuint16mf2_t v1 = vand(v, 0xFF, avl); + return vnclipu_wx_u8mf4(v1, 0, avl); +} + +template +HWY_API vuint8mf2_t TruncateTo(Simd d, + const VFromD> v) { + const size_t avl = Lanes(d); + const vuint16m1_t v1 = vand(v, 0xFF, avl); + return vnclipu_wx_u8mf2(v1, 0, avl); +} + +template +HWY_API vuint8m1_t TruncateTo(Simd d, + const VFromD> v) { + const size_t avl = Lanes(d); + const vuint16m2_t v1 = vand(v, 0xFF, avl); + return vnclipu_wx_u8m1(v1, 0, avl); +} + +template +HWY_API vuint8m2_t TruncateTo(Simd d, + const VFromD> v) { + const size_t avl = Lanes(d); + const vuint16m4_t v1 = vand(v, 0xFF, avl); + return vnclipu_wx_u8m2(v1, 0, avl); +} + +template +HWY_API vuint8m4_t TruncateTo(Simd d, + const VFromD> v) { + const size_t avl = Lanes(d); + const vuint16m8_t v1 = vand(v, 0xFF, avl); + return vnclipu_wx_u8m4(v1, 0, avl); +} + +// ------------------------------ DemoteTo I + +HWY_RVV_FOREACH_I16(HWY_RVV_DEMOTE, DemoteTo, vnclip_wx_, _DEMOTE_VIRT) +HWY_RVV_FOREACH_I32(HWY_RVV_DEMOTE, DemoteTo, vnclip_wx_, _DEMOTE_VIRT) + +template +HWY_API vint8mf8_t DemoteTo(Simd d, const vint32mf2_t v) { + return DemoteTo(d, DemoteTo(Simd(), v)); +} +template +HWY_API vint8mf4_t DemoteTo(Simd d, const vint32m1_t v) { + return DemoteTo(d, DemoteTo(Simd(), v)); +} +template +HWY_API vint8mf2_t DemoteTo(Simd d, const vint32m2_t v) { + return DemoteTo(d, DemoteTo(Simd(), v)); +} +template +HWY_API vint8m1_t DemoteTo(Simd d, const vint32m4_t v) { + return DemoteTo(d, DemoteTo(Simd(), v)); +} +template +HWY_API vint8m2_t DemoteTo(Simd d, const vint32m8_t v) { + return DemoteTo(d, DemoteTo(Simd(), v)); +} + +#undef HWY_RVV_DEMOTE + +// ------------------------------ DemoteTo F + +// SEW is for the source so we can use _DEMOTE. 
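+// Usage sketch (illustrative; `src` is a placeholder pointer): f64 -> f32
+// using the round-to-odd narrowing conversion generated below:
+//   const ScalableTag<double> dd;
+//   const Rebind<float, decltype(dd)> df;
+//   const auto single = DemoteTo(df, Load(dd, src));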
+#define HWY_RVV_DEMOTE_F(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \ + SHIFT, MLEN, NAME, OP) \ + template \ + HWY_API HWY_RVV_V(BASE, SEWH, LMULH) NAME( \ + HWY_RVV_D(BASE, SEWH, N, SHIFT - 1) d, HWY_RVV_V(BASE, SEW, LMUL) v) { \ + return OP##SEWH##LMULH(v, Lanes(d)); \ + } + +#if HWY_HAVE_FLOAT16 +HWY_RVV_FOREACH_F32(HWY_RVV_DEMOTE_F, DemoteTo, vfncvt_rod_f_f_w_f, + _DEMOTE_VIRT) +#endif +HWY_RVV_FOREACH_F64(HWY_RVV_DEMOTE_F, DemoteTo, vfncvt_rod_f_f_w_f, + _DEMOTE_VIRT) +#undef HWY_RVV_DEMOTE_F + +// TODO(janwas): add BASE2 arg to allow generating this via DEMOTE_F. +template +HWY_API vint32mf2_t DemoteTo(Simd d, const vfloat64m1_t v) { + return vfncvt_rtz_x_f_w_i32mf2(v, Lanes(d)); +} +template +HWY_API vint32mf2_t DemoteTo(Simd d, const vfloat64m1_t v) { + return vfncvt_rtz_x_f_w_i32mf2(v, Lanes(d)); +} +template +HWY_API vint32m1_t DemoteTo(Simd d, const vfloat64m2_t v) { + return vfncvt_rtz_x_f_w_i32m1(v, Lanes(d)); +} +template +HWY_API vint32m2_t DemoteTo(Simd d, const vfloat64m4_t v) { + return vfncvt_rtz_x_f_w_i32m2(v, Lanes(d)); +} +template +HWY_API vint32m4_t DemoteTo(Simd d, const vfloat64m8_t v) { + return vfncvt_rtz_x_f_w_i32m4(v, Lanes(d)); +} + +template +HWY_API VFromD> DemoteTo( + Simd d, VFromD> v) { + const RebindToUnsigned du16; + const Rebind du32; + return detail::DemoteToShr16(du16, BitCast(du32, v)); +} + +// ------------------------------ ConvertTo F + +#define HWY_RVV_CONVERT(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \ + SHIFT, MLEN, NAME, OP) \ + template \ + HWY_API HWY_RVV_V(BASE, SEW, LMUL) ConvertTo( \ + HWY_RVV_D(BASE, SEW, N, SHIFT) d, HWY_RVV_V(int, SEW, LMUL) v) { \ + return vfcvt_f_x_v_f##SEW##LMUL(v, Lanes(d)); \ + } \ + template \ + HWY_API HWY_RVV_V(BASE, SEW, LMUL) ConvertTo( \ + HWY_RVV_D(BASE, SEW, N, SHIFT) d, HWY_RVV_V(uint, SEW, LMUL) v) {\ + return vfcvt_f_xu_v_f##SEW##LMUL(v, Lanes(d)); \ + } \ + /* Truncates (rounds toward zero). */ \ + template \ + HWY_API HWY_RVV_V(int, SEW, LMUL) ConvertTo(HWY_RVV_D(int, SEW, N, SHIFT) d, \ + HWY_RVV_V(BASE, SEW, LMUL) v) { \ + return vfcvt_rtz_x_f_v_i##SEW##LMUL(v, Lanes(d)); \ + } \ +// API only requires f32 but we provide f64 for internal use. +HWY_RVV_FOREACH_F(HWY_RVV_CONVERT, _, _, _ALL_VIRT) +#undef HWY_RVV_CONVERT + +// Uses default rounding mode. Must be separate because there is no D arg. +#define HWY_RVV_NEAREST(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \ + SHIFT, MLEN, NAME, OP) \ + HWY_API HWY_RVV_V(int, SEW, LMUL) NearestInt(HWY_RVV_V(BASE, SEW, LMUL) v) { \ + return vfcvt_x_f_v_i##SEW##LMUL(v, HWY_RVV_AVL(SEW, SHIFT)); \ + } +HWY_RVV_FOREACH_F(HWY_RVV_NEAREST, _, _, _ALL) +#undef HWY_RVV_NEAREST + +// ================================================== COMBINE + +namespace detail { + +// For x86-compatible behaviour mandated by Highway API: TableLookupBytes +// offsets are implicitly relative to the start of their 128-bit block. +template +size_t LanesPerBlock(Simd d) { + size_t lpb = 16 / sizeof(T); + if (IsFull(d)) return lpb; + // Also honor the user-specified (constexpr) N limit. + lpb = HWY_MIN(lpb, N); + // No fraction, we're done. + if (kPow2 >= 0) return lpb; + // Fractional LMUL: Lanes(d) may be smaller than lpb, so honor that. 
+ return HWY_MIN(lpb, Lanes(d)); +} + +template +HWY_INLINE V OffsetsOf128BitBlocks(const D d, const V iota0) { + using T = MakeUnsigned>; + return AndS(iota0, static_cast(~(LanesPerBlock(d) - 1))); +} + +template +HWY_INLINE MFromD FirstNPerBlock(D /* tag */) { + const RebindToUnsigned du; + const RebindToSigned di; + using TU = TFromD; + const auto idx_mod = AndS(Iota0(du), static_cast(LanesPerBlock(du) - 1)); + return LtS(BitCast(di, idx_mod), static_cast>(kLanes)); +} + +// vector = f(vector, vector, size_t) +#define HWY_RVV_SLIDE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \ + MLEN, NAME, OP) \ + HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ + NAME(HWY_RVV_V(BASE, SEW, LMUL) dst, HWY_RVV_V(BASE, SEW, LMUL) src, \ + size_t lanes) { \ + return v##OP##_vx_##CHAR##SEW##LMUL(dst, src, lanes, \ + HWY_RVV_AVL(SEW, SHIFT)); \ + } + +HWY_RVV_FOREACH(HWY_RVV_SLIDE, SlideUp, slideup, _ALL) +HWY_RVV_FOREACH(HWY_RVV_SLIDE, SlideDown, slidedown, _ALL) + +#undef HWY_RVV_SLIDE + +} // namespace detail + +// ------------------------------ ConcatUpperLower +template +HWY_API V ConcatUpperLower(D d, const V hi, const V lo) { + return IfThenElse(FirstN(d, Lanes(d) / 2), lo, hi); +} + +// ------------------------------ ConcatLowerLower +template +HWY_API V ConcatLowerLower(D d, const V hi, const V lo) { + return detail::SlideUp(lo, hi, Lanes(d) / 2); +} + +// ------------------------------ ConcatUpperUpper +template +HWY_API V ConcatUpperUpper(D d, const V hi, const V lo) { + // Move upper half into lower + const auto lo_down = detail::SlideDown(lo, lo, Lanes(d) / 2); + return ConcatUpperLower(d, hi, lo_down); +} + +// ------------------------------ ConcatLowerUpper +template +HWY_API V ConcatLowerUpper(D d, const V hi, const V lo) { + // Move half of both inputs to the other half + const auto hi_up = detail::SlideUp(hi, hi, Lanes(d) / 2); + const auto lo_down = detail::SlideDown(lo, lo, Lanes(d) / 2); + return ConcatUpperLower(d, hi_up, lo_down); +} + +// ------------------------------ Combine +template +HWY_API VFromD Combine(D2 d2, const V hi, const V lo) { + return detail::SlideUp(detail::Ext(d2, lo), detail::Ext(d2, hi), + Lanes(d2) / 2); +} + +// ------------------------------ ZeroExtendVector + +template +HWY_API VFromD ZeroExtendVector(D2 d2, const V lo) { + return Combine(d2, Xor(lo, lo), lo); +} + +// ------------------------------ Lower/UpperHalf + +namespace detail { + +// RVV may only support LMUL >= SEW/64; returns whether that holds for D. Note +// that SEW = sizeof(T)*8 and LMUL = 1 << Pow2(). +template +constexpr bool IsSupportedLMUL(D d) { + return (size_t{1} << (Pow2(d) + 3)) >= sizeof(TFromD); +} + +} // namespace detail + +// If IsSupportedLMUL, just 'truncate' i.e. halve LMUL. +template * = nullptr> +HWY_API VFromD LowerHalf(const DH /* tag */, const VFromD> v) { + return detail::Trunc(v); +} + +// Otherwise, there is no corresponding intrinsic type (e.g. vuint64mf2_t), and +// the hardware may set "vill" if we attempt such an LMUL. However, the V +// extension on application processors requires Zvl128b, i.e. VLEN >= 128, so it +// still makes sense to have half of an SEW=64 vector. We instead just return +// the vector, and rely on the kPow2 in DH to halve the return value of Lanes(). 
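+// Either way, callers observe the same behavior (sketch):
+//   const ScalableTag<uint64_t> d;
+//   const Half<decltype(d)> dh;
+//   const auto lo = LowerHalf(dh, Iota(d, 0));  // Lanes(dh) == Lanes(d) / 2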
+template * = nullptr> +HWY_API V LowerHalf(const DH /* tag */, const V v) { + return v; +} + +// Same, but without D arg +template +HWY_API VFromD>> LowerHalf(const V v) { + return LowerHalf(Half>(), v); +} + +template +HWY_API VFromD UpperHalf(const DH d2, const VFromD> v) { + return LowerHalf(d2, detail::SlideDown(v, v, Lanes(d2))); +} + +// ================================================== SWIZZLE + +namespace detail { +// Special instruction for 1 lane is presumably faster? +#define HWY_RVV_SLIDE1(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \ + MLEN, NAME, OP) \ + HWY_API HWY_RVV_V(BASE, SEW, LMUL) NAME(HWY_RVV_V(BASE, SEW, LMUL) v) { \ + return v##OP##_##CHAR##SEW##LMUL(v, 0, HWY_RVV_AVL(SEW, SHIFT)); \ + } + +HWY_RVV_FOREACH_UI3264(HWY_RVV_SLIDE1, Slide1Up, slide1up_vx, _ALL) +HWY_RVV_FOREACH_F3264(HWY_RVV_SLIDE1, Slide1Up, fslide1up_vf, _ALL) +HWY_RVV_FOREACH_UI3264(HWY_RVV_SLIDE1, Slide1Down, slide1down_vx, _ALL) +HWY_RVV_FOREACH_F3264(HWY_RVV_SLIDE1, Slide1Down, fslide1down_vf, _ALL) +#undef HWY_RVV_SLIDE1 +} // namespace detail + +// ------------------------------ GetLane + +#define HWY_RVV_GET_LANE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \ + SHIFT, MLEN, NAME, OP) \ + HWY_API HWY_RVV_T(BASE, SEW) NAME(HWY_RVV_V(BASE, SEW, LMUL) v) { \ + return v##OP##_s_##CHAR##SEW##LMUL##_##CHAR##SEW(v); /* no AVL */ \ + } + +HWY_RVV_FOREACH_UI(HWY_RVV_GET_LANE, GetLane, mv_x, _ALL) +HWY_RVV_FOREACH_F(HWY_RVV_GET_LANE, GetLane, fmv_f, _ALL) +#undef HWY_RVV_GET_LANE + +// ------------------------------ ExtractLane +template +HWY_API TFromV ExtractLane(const V v, size_t i) { + return GetLane(detail::SlideDown(v, v, i)); +} + +// ------------------------------ InsertLane + +template +HWY_API V InsertLane(const V v, size_t i, TFromV t) { + const DFromV d; + const RebindToUnsigned du; // Iota0 is unsigned only + using TU = TFromD; + const auto is_i = detail::EqS(detail::Iota0(du), static_cast(i)); + return IfThenElse(RebindMask(d, is_i), Set(d, t), v); +} + +namespace detail { +HWY_RVV_FOREACH_B(HWY_RVV_RETM_ARGM, SetOnlyFirst, sof) +} // namespace detail + +// For 8-bit lanes, Iota0 might overflow. 
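+// The overload below therefore derives the lane mask from SlideUp and
+// detail::SetOnlyFirst instead. Caller-side sketch for ExtractLane/InsertLane
+// (illustrative):
+//   const ScalableTag<uint8_t> d;
+//   auto v = Iota(d, 0);
+//   const uint8_t lane2 = ExtractLane(v, 2);  // == 2
+//   v = InsertLane(v, 2, static_cast<uint8_t>(99));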
+template +HWY_API V InsertLane(const V v, size_t i, TFromV t) { + const DFromV d; + const auto zero = Zero(d); + const auto one = Set(d, 1); + const auto ge_i = Eq(detail::SlideUp(zero, one, i), one); + const auto is_i = detail::SetOnlyFirst(ge_i); + return IfThenElse(RebindMask(d, is_i), Set(d, t), v); +} + +// ------------------------------ OddEven +template +HWY_API V OddEven(const V a, const V b) { + const RebindToUnsigned> du; // Iota0 is unsigned only + const auto is_even = detail::EqS(detail::AndS(detail::Iota0(du), 1), 0); + return IfThenElse(is_even, b, a); +} + +// ------------------------------ DupEven (OddEven) +template +HWY_API V DupEven(const V v) { + const V up = detail::Slide1Up(v); + return OddEven(up, v); +} + +// ------------------------------ DupOdd (OddEven) +template +HWY_API V DupOdd(const V v) { + const V down = detail::Slide1Down(v); + return OddEven(v, down); +} + +// ------------------------------ OddEvenBlocks +template +HWY_API V OddEvenBlocks(const V a, const V b) { + const RebindToUnsigned> du; // Iota0 is unsigned only + constexpr size_t kShift = CeilLog2(16 / sizeof(TFromV)); + const auto idx_block = ShiftRight(detail::Iota0(du)); + const auto is_even = detail::EqS(detail::AndS(idx_block, 1), 0); + return IfThenElse(is_even, b, a); +} + +// ------------------------------ SwapAdjacentBlocks + +template +HWY_API V SwapAdjacentBlocks(const V v) { + const DFromV d; + const size_t lpb = detail::LanesPerBlock(d); + const V down = detail::SlideDown(v, v, lpb); + const V up = detail::SlideUp(v, v, lpb); + return OddEvenBlocks(up, down); +} + +// ------------------------------ TableLookupLanes + +template +HWY_API VFromD> IndicesFromVec(D d, VI vec) { + static_assert(sizeof(TFromD) == sizeof(TFromV), "Index != lane"); + const RebindToUnsigned du; // instead of : avoids unused d. + const auto indices = BitCast(du, vec); +#if HWY_IS_DEBUG_BUILD + HWY_DASSERT(AllTrue(du, detail::LtS(indices, Lanes(d)))); +#endif + return indices; +} + +template +HWY_API VFromD> SetTableIndices(D d, const TI* idx) { + static_assert(sizeof(TFromD) == sizeof(TI), "Index size must match lane"); + return IndicesFromVec(d, LoadU(Rebind(), idx)); +} + +// <32bit are not part of Highway API, but used in Broadcast. This limits VLMAX +// to 2048! We could instead use vrgatherei16. 
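+// Usage sketch for the TableLookupLanes generated below (illustrative):
+// gather lanes of a placeholder vector `v` by index, here reversing them
+// (the Reverse op later in this file does the same more directly):
+//   const ScalableTag<uint32_t> d;
+//   const RebindToUnsigned<decltype(d)> du;
+//   const auto rev =
+//       Sub(Set(du, static_cast<uint32_t>(Lanes(d) - 1)), Iota(du, 0));
+//   const auto reversed = TableLookupLanes(v, IndicesFromVec(d, rev));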
+#define HWY_RVV_TABLE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \ + MLEN, NAME, OP) \ + HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ + NAME(HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_V(uint, SEW, LMUL) idx) { \ + return v##OP##_vv_##CHAR##SEW##LMUL(v, idx, HWY_RVV_AVL(SEW, SHIFT)); \ + } + +HWY_RVV_FOREACH(HWY_RVV_TABLE, TableLookupLanes, rgather, _ALL) +#undef HWY_RVV_TABLE + +// ------------------------------ ConcatOdd (TableLookupLanes) +template +HWY_API V ConcatOdd(D d, const V hi, const V lo) { + const RebindToUnsigned du; // Iota0 is unsigned only + const auto iota = detail::Iota0(du); + const auto idx = detail::AddS(Add(iota, iota), 1); + const auto lo_odd = TableLookupLanes(lo, idx); + const auto hi_odd = TableLookupLanes(hi, idx); + return detail::SlideUp(lo_odd, hi_odd, Lanes(d) / 2); +} + +// ------------------------------ ConcatEven (TableLookupLanes) +template +HWY_API V ConcatEven(D d, const V hi, const V lo) { + const RebindToUnsigned du; // Iota0 is unsigned only + const auto iota = detail::Iota0(du); + const auto idx = Add(iota, iota); + const auto lo_even = TableLookupLanes(lo, idx); + const auto hi_even = TableLookupLanes(hi, idx); + return detail::SlideUp(lo_even, hi_even, Lanes(d) / 2); +} + +// ------------------------------ Reverse (TableLookupLanes) +template +HWY_API VFromD Reverse(D /* tag */, VFromD v) { + const RebindToUnsigned du; + using TU = TFromD; + const size_t N = Lanes(du); + const auto idx = + detail::ReverseSubS(detail::Iota0(du), static_cast(N - 1)); + return TableLookupLanes(v, idx); +} + +// ------------------------------ Reverse2 (RotateRight, OddEven) + +// Shifting and adding requires fewer instructions than blending, but casting to +// u32 only works for LMUL in [1/2, 8]. +template +HWY_API VFromD Reverse2(D d, const VFromD v) { + const Repartition du32; + return BitCast(d, RotateRight<16>(BitCast(du32, v))); +} +// For LMUL < 1/2, we can extend and then truncate. +template +HWY_API VFromD Reverse2(D d, const VFromD v) { + const Twice d2; + const Twice d4; + const Repartition du32; + const auto vx = detail::Ext(d4, detail::Ext(d2, v)); + const auto rx = BitCast(d4, RotateRight<16>(BitCast(du32, vx))); + return detail::Trunc(detail::Trunc(rx)); +} + +// Shifting and adding requires fewer instructions than blending, but casting to +// u64 does not work for LMUL < 1. +template +HWY_API VFromD Reverse2(D d, const VFromD v) { + const Repartition du64; + return BitCast(d, RotateRight<32>(BitCast(du64, v))); +} + +// For fractions, we can extend and then truncate. 
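+// (The next overload implements exactly that for 4-byte lanes.) Net effect of
+// every Reverse2 overload (sketch): lanes {0, 1, 2, 3, ...} become
+// {1, 0, 3, 2, ...}.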
+template +HWY_API VFromD Reverse2(D d, const VFromD v) { + const Twice d2; + const Twice d4; + const Repartition du64; + const auto vx = detail::Ext(d4, detail::Ext(d2, v)); + const auto rx = BitCast(d4, RotateRight<32>(BitCast(du64, vx))); + return detail::Trunc(detail::Trunc(rx)); +} + +template , HWY_IF_LANE_SIZE_D(D, 8)> +HWY_API V Reverse2(D /* tag */, const V v) { + const V up = detail::Slide1Up(v); + const V down = detail::Slide1Down(v); + return OddEven(up, down); +} + +// ------------------------------ Reverse4 (TableLookupLanes) + +template +HWY_API VFromD Reverse4(D d, const VFromD v) { + const RebindToUnsigned du; + const auto idx = detail::XorS(detail::Iota0(du), 3); + return BitCast(d, TableLookupLanes(BitCast(du, v), idx)); +} + +// ------------------------------ Reverse8 (TableLookupLanes) + +template +HWY_API VFromD Reverse8(D d, const VFromD v) { + const RebindToUnsigned du; + const auto idx = detail::XorS(detail::Iota0(du), 7); + return BitCast(d, TableLookupLanes(BitCast(du, v), idx)); +} + +// ------------------------------ ReverseBlocks (Reverse, Shuffle01) +template > +HWY_API V ReverseBlocks(D d, V v) { + const Repartition du64; + const size_t N = Lanes(du64); + const auto rev = + detail::ReverseSubS(detail::Iota0(du64), static_cast(N - 1)); + // Swap lo/hi u64 within each block + const auto idx = detail::XorS(rev, 1); + return BitCast(d, TableLookupLanes(BitCast(du64, v), idx)); +} + +// ------------------------------ Compress + +template +struct CompressIsPartition { + enum { value = 0 }; +}; + +#define HWY_RVV_COMPRESS(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \ + SHIFT, MLEN, NAME, OP) \ + HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ + NAME(HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_M(MLEN) mask) { \ + return v##OP##_vm_##CHAR##SEW##LMUL(mask, v, v, HWY_RVV_AVL(SEW, SHIFT)); \ + } + +HWY_RVV_FOREACH_UI163264(HWY_RVV_COMPRESS, Compress, compress, _ALL) +HWY_RVV_FOREACH_F(HWY_RVV_COMPRESS, Compress, compress, _ALL) +#undef HWY_RVV_COMPRESS + +// ------------------------------ CompressNot +template +HWY_API V CompressNot(V v, const M mask) { + return Compress(v, Not(mask)); +} + +// ------------------------------ CompressBlocksNot +template +HWY_API V CompressBlocksNot(V v, const M mask) { + return CompressNot(v, mask); +} + +// ------------------------------ CompressStore +template +HWY_API size_t CompressStore(const V v, const M mask, const D d, + TFromD* HWY_RESTRICT unaligned) { + StoreU(Compress(v, mask), d, unaligned); + return CountTrue(d, mask); +} + +// ------------------------------ CompressBlendedStore +template +HWY_API size_t CompressBlendedStore(const V v, const M mask, const D d, + TFromD* HWY_RESTRICT unaligned) { + const size_t count = CountTrue(d, mask); + detail::StoreN(count, Compress(v, mask), d, unaligned); + return count; +} + +// ================================================== BLOCKWISE + +// ------------------------------ CombineShiftRightBytes +template > +HWY_API V CombineShiftRightBytes(const D d, const V hi, V lo) { + const Repartition d8; + const auto hi8 = BitCast(d8, hi); + const auto lo8 = BitCast(d8, lo); + const auto hi_up = detail::SlideUp(hi8, hi8, 16 - kBytes); + const auto lo_down = detail::SlideDown(lo8, lo8, kBytes); + const auto is_lo = detail::FirstNPerBlock<16 - kBytes>(d8); + return BitCast(d, IfThenElse(is_lo, lo_down, hi_up)); +} + +// ------------------------------ CombineShiftRightLanes +template > +HWY_API V CombineShiftRightLanes(const D d, const V hi, V lo) { + constexpr size_t kLanesUp = 16 / sizeof(TFromV) - 
kLanes; + const auto hi_up = detail::SlideUp(hi, hi, kLanesUp); + const auto lo_down = detail::SlideDown(lo, lo, kLanes); + const auto is_lo = detail::FirstNPerBlock(d); + return IfThenElse(is_lo, lo_down, hi_up); +} + +// ------------------------------ Shuffle2301 (ShiftLeft) +template +HWY_API V Shuffle2301(const V v) { + const DFromV d; + static_assert(sizeof(TFromD) == 4, "Defined for 32-bit types"); + const Repartition du64; + const auto v64 = BitCast(du64, v); + return BitCast(d, Or(ShiftRight<32>(v64), ShiftLeft<32>(v64))); +} + +// ------------------------------ Shuffle2103 +template +HWY_API V Shuffle2103(const V v) { + const DFromV d; + static_assert(sizeof(TFromD) == 4, "Defined for 32-bit types"); + return CombineShiftRightLanes<3>(d, v, v); +} + +// ------------------------------ Shuffle0321 +template +HWY_API V Shuffle0321(const V v) { + const DFromV d; + static_assert(sizeof(TFromD) == 4, "Defined for 32-bit types"); + return CombineShiftRightLanes<1>(d, v, v); +} + +// ------------------------------ Shuffle1032 +template +HWY_API V Shuffle1032(const V v) { + const DFromV d; + static_assert(sizeof(TFromD) == 4, "Defined for 32-bit types"); + return CombineShiftRightLanes<2>(d, v, v); +} + +// ------------------------------ Shuffle01 +template +HWY_API V Shuffle01(const V v) { + const DFromV d; + static_assert(sizeof(TFromD) == 8, "Defined for 64-bit types"); + return CombineShiftRightLanes<1>(d, v, v); +} + +// ------------------------------ Shuffle0123 +template +HWY_API V Shuffle0123(const V v) { + return Shuffle2301(Shuffle1032(v)); +} + +// ------------------------------ TableLookupBytes + +// Extends or truncates a vector to match the given d. +namespace detail { + +template +HWY_INLINE auto ChangeLMUL(Simd d, VFromD> v) + -> VFromD { + const Simd dh; + const Simd dhh; + return Ext(d, Ext(dh, Ext(dhh, v))); +} +template +HWY_INLINE auto ChangeLMUL(Simd d, VFromD> v) + -> VFromD { + const Simd dh; + return Ext(d, Ext(dh, v)); +} +template +HWY_INLINE auto ChangeLMUL(Simd d, VFromD> v) + -> VFromD { + return Ext(d, v); +} + +template +HWY_INLINE auto ChangeLMUL(Simd d, VFromD v) + -> VFromD { + return v; +} + +template +HWY_INLINE auto ChangeLMUL(Simd d, VFromD> v) + -> VFromD { + return Trunc(v); +} +template +HWY_INLINE auto ChangeLMUL(Simd d, VFromD> v) + -> VFromD { + return Trunc(Trunc(v)); +} +template +HWY_INLINE auto ChangeLMUL(Simd d, VFromD> v) + -> VFromD { + return Trunc(Trunc(Trunc(v))); +} + +} // namespace detail + +template +HWY_API VI TableLookupBytes(const VT vt, const VI vi) { + const DFromV dt; // T=table, I=index. + const DFromV di; + const Repartition dt8; + const Repartition di8; + // Required for producing half-vectors with table lookups from a full vector. + // If we instead run at the LMUL of the index vector, lookups into the table + // would be truncated. Thus we run at the larger of the two LMULs and truncate + // the result vector to the original index LMUL. + constexpr int kPow2T = Pow2(dt8); + constexpr int kPow2I = Pow2(di8); + const Simd dm8; // m=max + const auto vmt = detail::ChangeLMUL(dm8, BitCast(dt8, vt)); + const auto vmi = detail::ChangeLMUL(dm8, BitCast(di8, vi)); + auto offsets = detail::OffsetsOf128BitBlocks(dm8, detail::Iota0(dm8)); + // If the table is shorter, wrap around offsets so they do not reference + // undefined lanes in the newly extended vmt. 
+ if (kPow2T < kPow2I) { + offsets = detail::AndS(offsets, static_cast(Lanes(dt8) - 1)); + } + const auto out = TableLookupLanes(vmt, Add(vmi, offsets)); + return BitCast(di, detail::ChangeLMUL(di8, out)); +} + +template +HWY_API VI TableLookupBytesOr0(const VT vt, const VI idx) { + const DFromV di; + const Repartition di8; + const auto idx8 = BitCast(di8, idx); + const auto lookup = TableLookupBytes(vt, idx8); + return BitCast(di, IfThenZeroElse(detail::LtS(idx8, 0), lookup)); +} + +// ------------------------------ Broadcast +template +HWY_API V Broadcast(const V v) { + const DFromV d; + HWY_DASSERT(0 <= kLane && kLane < detail::LanesPerBlock(d)); + auto idx = detail::OffsetsOf128BitBlocks(d, detail::Iota0(d)); + if (kLane != 0) { + idx = detail::AddS(idx, kLane); + } + return TableLookupLanes(v, idx); +} + +// ------------------------------ ShiftLeftLanes + +template > +HWY_API V ShiftLeftLanes(const D d, const V v) { + const RebindToSigned di; + using TI = TFromD; + const auto shifted = detail::SlideUp(v, v, kLanes); + // Match x86 semantics by zeroing lower lanes in 128-bit blocks + const auto idx_mod = + detail::AndS(BitCast(di, detail::Iota0(di)), + static_cast(detail::LanesPerBlock(di) - 1)); + const auto clear = detail::LtS(idx_mod, static_cast(kLanes)); + return IfThenZeroElse(clear, shifted); +} + +template +HWY_API V ShiftLeftLanes(const V v) { + return ShiftLeftLanes(DFromV(), v); +} + +// ------------------------------ ShiftLeftBytes + +template +HWY_API VFromD ShiftLeftBytes(D d, const VFromD v) { + const Repartition d8; + return BitCast(d, ShiftLeftLanes(BitCast(d8, v))); +} + +template +HWY_API V ShiftLeftBytes(const V v) { + return ShiftLeftBytes(DFromV(), v); +} + +// ------------------------------ ShiftRightLanes +template >> +HWY_API V ShiftRightLanes(const Simd d, V v) { + const RebindToSigned di; + using TI = TFromD; + // For partial vectors, clear upper lanes so we shift in zeros. 
+ if (N <= 16 / sizeof(T)) { + v = IfThenElseZero(FirstN(d, N), v); + } + + const auto shifted = detail::SlideDown(v, v, kLanes); + // Match x86 semantics by zeroing upper lanes in 128-bit blocks + const size_t lpb = detail::LanesPerBlock(di); + const auto idx_mod = + detail::AndS(BitCast(di, detail::Iota0(di)), static_cast(lpb - 1)); + const auto keep = detail::LtS(idx_mod, static_cast(lpb - kLanes)); + return IfThenElseZero(keep, shifted); +} + +// ------------------------------ ShiftRightBytes +template > +HWY_API V ShiftRightBytes(const D d, const V v) { + const Repartition d8; + return BitCast(d, ShiftRightLanes(d8, BitCast(d8, v))); +} + +// ------------------------------ InterleaveLower + +template +HWY_API V InterleaveLower(D d, const V a, const V b) { + static_assert(IsSame, TFromV>(), "D/V mismatch"); + const RebindToUnsigned du; + using TU = TFromD; + const auto i = detail::Iota0(du); + const auto idx_mod = ShiftRight<1>( + detail::AndS(i, static_cast(detail::LanesPerBlock(du) - 1))); + const auto idx = Add(idx_mod, detail::OffsetsOf128BitBlocks(d, i)); + const auto is_even = detail::EqS(detail::AndS(i, 1), 0u); + return IfThenElse(is_even, TableLookupLanes(a, idx), + TableLookupLanes(b, idx)); +} + +template +HWY_API V InterleaveLower(const V a, const V b) { + return InterleaveLower(DFromV(), a, b); +} + +// ------------------------------ InterleaveUpper + +template +HWY_API V InterleaveUpper(const D d, const V a, const V b) { + static_assert(IsSame, TFromV>(), "D/V mismatch"); + const RebindToUnsigned du; + using TU = TFromD; + const size_t lpb = detail::LanesPerBlock(du); + const auto i = detail::Iota0(du); + const auto idx_mod = ShiftRight<1>(detail::AndS(i, static_cast(lpb - 1))); + const auto idx_lower = Add(idx_mod, detail::OffsetsOf128BitBlocks(d, i)); + const auto idx = detail::AddS(idx_lower, static_cast(lpb / 2)); + const auto is_even = detail::EqS(detail::AndS(i, 1), 0u); + return IfThenElse(is_even, TableLookupLanes(a, idx), + TableLookupLanes(b, idx)); +} + +// ------------------------------ ZipLower + +template >> +HWY_API VFromD ZipLower(DW dw, V a, V b) { + const RepartitionToNarrow dn; + static_assert(IsSame, TFromV>(), "D/V mismatch"); + return BitCast(dw, InterleaveLower(dn, a, b)); +} + +template >> +HWY_API VFromD ZipLower(V a, V b) { + return BitCast(DW(), InterleaveLower(a, b)); +} + +// ------------------------------ ZipUpper +template +HWY_API VFromD ZipUpper(DW dw, V a, V b) { + const RepartitionToNarrow dn; + static_assert(IsSame, TFromV>(), "D/V mismatch"); + return BitCast(dw, InterleaveUpper(dn, a, b)); +} + +// ================================================== REDUCE + +// vector = f(vector, zero_m1) +#define HWY_RVV_REDUCE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \ + MLEN, NAME, OP) \ + template \ + HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ + NAME(D d, HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_V(BASE, SEW, m1) v0) { \ + return Set(d, GetLane(v##OP##_vs_##CHAR##SEW##LMUL##_##CHAR##SEW##m1( \ + v0, v, v0, Lanes(d)))); \ + } + +// ------------------------------ SumOfLanes + +namespace detail { +HWY_RVV_FOREACH_UI(HWY_RVV_REDUCE, RedSum, redsum, _ALL) +HWY_RVV_FOREACH_F(HWY_RVV_REDUCE, RedSum, fredusum, _ALL) +} // namespace detail + +template +HWY_API VFromD SumOfLanes(D d, const VFromD v) { + const auto v0 = Zero(ScalableTag>()); // always m1 + return detail::RedSum(d, v, v0); +} + +// ------------------------------ MinOfLanes +namespace detail { +HWY_RVV_FOREACH_U(HWY_RVV_REDUCE, RedMin, redminu, _ALL) +HWY_RVV_FOREACH_I(HWY_RVV_REDUCE, 
RedMin, redmin, _ALL) +HWY_RVV_FOREACH_F(HWY_RVV_REDUCE, RedMin, fredmin, _ALL) +} // namespace detail + +template +HWY_API VFromD MinOfLanes(D d, const VFromD v) { + using T = TFromD; + const ScalableTag d1; // always m1 + const auto neutral = Set(d1, HighestValue()); + return detail::RedMin(d, v, neutral); +} + +// ------------------------------ MaxOfLanes +namespace detail { +HWY_RVV_FOREACH_U(HWY_RVV_REDUCE, RedMax, redmaxu, _ALL) +HWY_RVV_FOREACH_I(HWY_RVV_REDUCE, RedMax, redmax, _ALL) +HWY_RVV_FOREACH_F(HWY_RVV_REDUCE, RedMax, fredmax, _ALL) +} // namespace detail + +template +HWY_API VFromD MaxOfLanes(D d, const VFromD v) { + using T = TFromD; + const ScalableTag d1; // always m1 + const auto neutral = Set(d1, LowestValue()); + return detail::RedMax(d, v, neutral); +} + +#undef HWY_RVV_REDUCE + +// ================================================== Ops with dependencies + +// ------------------------------ PopulationCount (ShiftRight) + +// Handles LMUL >= 2 or capped vectors, which generic_ops-inl cannot. +template , HWY_IF_LANE_SIZE_D(D, 1), + hwy::EnableIf* = nullptr> +HWY_API V PopulationCount(V v) { + // See https://arxiv.org/pdf/1611.07612.pdf, Figure 3 + v = Sub(v, detail::AndS(ShiftRight<1>(v), 0x55)); + v = Add(detail::AndS(ShiftRight<2>(v), 0x33), detail::AndS(v, 0x33)); + return detail::AndS(Add(v, ShiftRight<4>(v)), 0x0F); +} + +// ------------------------------ LoadDup128 + +template +HWY_API VFromD LoadDup128(D d, const TFromD* const HWY_RESTRICT p) { + const VFromD loaded = Load(d, p); + // idx must be unsigned for TableLookupLanes. + using TU = MakeUnsigned>; + const TU mask = static_cast(detail::LanesPerBlock(d) - 1); + // Broadcast the first block. + const VFromD> idx = detail::AndS(detail::Iota0(d), mask); + return TableLookupLanes(loaded, idx); +} + +// ------------------------------ LoadMaskBits + +// Support all combinations of T and SHIFT(LMUL) without explicit overloads for +// each. First overload for MLEN=1..64. +namespace detail { + +// Maps D to MLEN (wrapped in SizeTag), such that #mask_bits = VLEN/MLEN. MLEN +// increases with lane size and decreases for increasing LMUL. Cap at 64, the +// largest supported by HWY_RVV_FOREACH_B (and intrinsics), for virtual LMUL +// e.g. vuint16mf8_t: (8*2 << 3) == 128. +template +using MaskTag = hwy::SizeTag), -Pow2(D())))>; + +#define HWY_RVV_LOAD_MASK_BITS(SEW, SHIFT, MLEN, NAME, OP) \ + HWY_INLINE HWY_RVV_M(MLEN) \ + NAME(hwy::SizeTag /* tag */, const uint8_t* bits, size_t N) { \ + return OP##_v_b##MLEN(bits, N); \ + } +HWY_RVV_FOREACH_B(HWY_RVV_LOAD_MASK_BITS, LoadMaskBits, vlm) +#undef HWY_RVV_LOAD_MASK_BITS +} // namespace detail + +template > +HWY_API auto LoadMaskBits(D d, const uint8_t* bits) + -> decltype(detail::LoadMaskBits(MT(), bits, Lanes(d))) { + return detail::LoadMaskBits(MT(), bits, Lanes(d)); +} + +// ------------------------------ StoreMaskBits +#define HWY_RVV_STORE_MASK_BITS(SEW, SHIFT, MLEN, NAME, OP) \ + template \ + HWY_API size_t NAME(D d, HWY_RVV_M(MLEN) m, uint8_t* bits) { \ + const size_t N = Lanes(d); \ + OP##_v_b##MLEN(bits, m, N); \ + /* Non-full byte, need to clear the undefined upper bits. */ \ + /* Use MaxLanes and sizeof(T) to move some checks to compile-time. 
*/ \ + constexpr bool kLessThan8 = \ + detail::ScaleByPower(16 / sizeof(TFromD), Pow2(d)) < 8; \ + if (MaxLanes(d) < 8 || (kLessThan8 && N < 8)) { \ + const int mask = (1 << N) - 1; \ + bits[0] = static_cast(bits[0] & mask); \ + } \ + return (N + 7) / 8; \ + } +HWY_RVV_FOREACH_B(HWY_RVV_STORE_MASK_BITS, StoreMaskBits, vsm) +#undef HWY_RVV_STORE_MASK_BITS + +// ------------------------------ CompressBits, CompressBitsStore (LoadMaskBits) + +template +HWY_INLINE V CompressBits(V v, const uint8_t* HWY_RESTRICT bits) { + return Compress(v, LoadMaskBits(DFromV(), bits)); +} + +template +HWY_API size_t CompressBitsStore(VFromD v, const uint8_t* HWY_RESTRICT bits, + D d, TFromD* HWY_RESTRICT unaligned) { + return CompressStore(v, LoadMaskBits(d, bits), d, unaligned); +} + +// ------------------------------ FirstN (Iota0, Lt, RebindMask, SlideUp) + +// Disallow for 8-bit because Iota is likely to overflow. +template +HWY_API MFromD FirstN(const D d, const size_t n) { + const RebindToSigned di; + using TI = TFromD; + return RebindMask( + d, detail::LtS(BitCast(di, detail::Iota0(d)), static_cast(n))); +} + +template +HWY_API MFromD FirstN(const D d, const size_t n) { + const auto zero = Zero(d); + const auto one = Set(d, 1); + return Eq(detail::SlideUp(one, zero, n), one); +} + +// ------------------------------ Neg (Sub) + +template +HWY_API V Neg(const V v) { + return detail::ReverseSubS(v, 0); +} + +// vector = f(vector), but argument is repeated +#define HWY_RVV_RETV_ARGV2(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \ + SHIFT, MLEN, NAME, OP) \ + HWY_API HWY_RVV_V(BASE, SEW, LMUL) NAME(HWY_RVV_V(BASE, SEW, LMUL) v) { \ + return v##OP##_vv_##CHAR##SEW##LMUL(v, v, HWY_RVV_AVL(SEW, SHIFT)); \ + } + +HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGV2, Neg, fsgnjn, _ALL) + +// ------------------------------ Abs (Max, Neg) + +template +HWY_API V Abs(const V v) { + return Max(v, Neg(v)); +} + +HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGV2, Abs, fsgnjx, _ALL) + +#undef HWY_RVV_RETV_ARGV2 + +// ------------------------------ AbsDiff (Abs, Sub) +template +HWY_API V AbsDiff(const V a, const V b) { + return Abs(Sub(a, b)); +} + +// ------------------------------ Round (NearestInt, ConvertTo, CopySign) + +// IEEE-754 roundToIntegralTiesToEven returns floating-point, but we do not have +// a dedicated instruction for that. Rounding to integer and converting back to +// float is correct except when the input magnitude is large, in which case the +// input was already an integer (because mantissa >> exponent is zero). 
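+// For float, MantissaEnd() is 2^23 = 8388608; any |v| >= 2^23 has a step
+// (ulp) of at least 1.0 and is therefore already an integer, so only
+// |v| < MantissaEnd() takes the int round-trip. detail::UseInt below
+// computes exactly this mask.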
+ +namespace detail { +enum RoundingModes { kNear, kTrunc, kDown, kUp }; + +template +HWY_INLINE auto UseInt(const V v) -> decltype(MaskFromVec(v)) { + return detail::LtS(Abs(v), MantissaEnd>()); +} + +} // namespace detail + +template +HWY_API V Round(const V v) { + const DFromV df; + + const auto integer = NearestInt(v); // round using current mode + const auto int_f = ConvertTo(df, integer); + + return IfThenElse(detail::UseInt(v), CopySign(int_f, v), v); +} + +// ------------------------------ Trunc (ConvertTo) +template +HWY_API V Trunc(const V v) { + const DFromV df; + const RebindToSigned di; + + const auto integer = ConvertTo(di, v); // round toward 0 + const auto int_f = ConvertTo(df, integer); + + return IfThenElse(detail::UseInt(v), CopySign(int_f, v), v); +} + +// ------------------------------ Ceil +template +HWY_API V Ceil(const V v) { + asm volatile("fsrm %0" ::"r"(detail::kUp)); + const auto ret = Round(v); + asm volatile("fsrm %0" ::"r"(detail::kNear)); + return ret; +} + +// ------------------------------ Floor +template +HWY_API V Floor(const V v) { + asm volatile("fsrm %0" ::"r"(detail::kDown)); + const auto ret = Round(v); + asm volatile("fsrm %0" ::"r"(detail::kNear)); + return ret; +} + +// ------------------------------ Floating-point classification (Ne) + +// vfclass does not help because it would require 3 instructions (to AND and +// then compare the bits), whereas these are just 1-3 integer instructions. + +template +HWY_API MFromD> IsNaN(const V v) { + return Ne(v, v); +} + +template > +HWY_API MFromD IsInf(const V v) { + const D d; + const RebindToSigned di; + using T = TFromD; + const VFromD vi = BitCast(di, v); + // 'Shift left' to clear the sign bit, check for exponent=max and mantissa=0. + return RebindMask(d, detail::EqS(Add(vi, vi), hwy::MaxExponentTimes2())); +} + +// Returns whether normal/subnormal/zero. +template > +HWY_API MFromD IsFinite(const V v) { + const D d; + const RebindToUnsigned du; + const RebindToSigned di; // cheaper than unsigned comparison + using T = TFromD; + const VFromD vu = BitCast(du, v); + // 'Shift left' to clear the sign bit, then right so we can compare with the + // max exponent (cannot compare with MaxExponentTimes2 directly because it is + // negative and non-negative floats would be greater). + const VFromD exp = + BitCast(di, ShiftRight() + 1>(Add(vu, vu))); + return RebindMask(d, detail::LtS(exp, hwy::MaxExponentField())); +} + +// ------------------------------ Iota (ConvertTo) + +template +HWY_API VFromD Iota(const D d, TFromD first) { + return detail::AddS(detail::Iota0(d), first); +} + +template +HWY_API VFromD Iota(const D d, TFromD first) { + const RebindToUnsigned du; + return detail::AddS(BitCast(d, detail::Iota0(du)), first); +} + +template +HWY_API VFromD Iota(const D d, TFromD first) { + const RebindToUnsigned du; + const RebindToSigned di; + return detail::AddS(ConvertTo(d, BitCast(di, detail::Iota0(du))), first); +} + +// ------------------------------ MulEven/Odd (Mul, OddEven) + +template , + class DW = RepartitionToWide> +HWY_API VFromD MulEven(const V a, const V b) { + const auto lo = Mul(a, b); + const auto hi = detail::MulHigh(a, b); + return BitCast(DW(), OddEven(detail::Slide1Up(hi), lo)); +} + +// There is no 64x64 vwmul. 
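+// Sketch of the fallback: Mul provides the low 64 bits and detail::MulHigh
+// the high 64 bits of each product; Slide1Up moves the high halves into the
+// odd lanes and OddEven merges them with the even (low) lanes, so each
+// even/odd lane pair holds one 128-bit product.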
+template +HWY_INLINE V MulEven(const V a, const V b) { + const auto lo = Mul(a, b); + const auto hi = detail::MulHigh(a, b); + return OddEven(detail::Slide1Up(hi), lo); +} + +template +HWY_INLINE V MulOdd(const V a, const V b) { + const auto lo = Mul(a, b); + const auto hi = detail::MulHigh(a, b); + return OddEven(hi, detail::Slide1Down(lo)); +} + +// ------------------------------ ReorderDemote2To (OddEven, Combine) + +template +HWY_API VFromD> ReorderDemote2To( + Simd dbf16, + VFromD> a, + VFromD> b) { + const RebindToUnsigned du16; + const RebindToUnsigned> du32; + const VFromD b_in_even = ShiftRight<16>(BitCast(du32, b)); + return BitCast(dbf16, OddEven(BitCast(du16, a), BitCast(du16, b_in_even))); +} + +// If LMUL is not the max, Combine first to avoid another DemoteTo. +template * = nullptr, + class D32 = RepartitionToWide>> +HWY_API VFromD> ReorderDemote2To( + Simd d16, VFromD a, VFromD b) { + const Twice d32t; + const VFromD ab = Combine(d32t, a, b); + return DemoteTo(d16, ab); +} + +// Max LMUL: must DemoteTo first, then Combine. +template >>> +HWY_API VFromD> ReorderDemote2To(Simd d16, + V32 a, V32 b) { + const Half d16h; + const VFromD a16 = DemoteTo(d16h, a); + const VFromD b16 = DemoteTo(d16h, b); + return Combine(d16, a16, b16); +} + +// ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower) + +namespace detail { + +// Non-overloaded wrapper function so we can define DF32 in template args. +template < + size_t N, int kPow2, class DF32 = Simd, + class VF32 = VFromD, + class DU16 = RepartitionToNarrow>>> +HWY_API VF32 ReorderWidenMulAccumulateBF16(Simd df32, + VFromD a, VFromD b, + const VF32 sum0, VF32& sum1) { + const DU16 du16; + const RebindToUnsigned du32; + using VU32 = VFromD; + const VFromD zero = Zero(du16); + const VU32 a0 = ZipLower(du32, zero, BitCast(du16, a)); + const VU32 a1 = ZipUpper(du32, zero, BitCast(du16, a)); + const VU32 b0 = ZipLower(du32, zero, BitCast(du16, b)); + const VU32 b1 = ZipUpper(du32, zero, BitCast(du16, b)); + sum1 = MulAdd(BitCast(df32, a1), BitCast(df32, b1), sum1); + return MulAdd(BitCast(df32, a0), BitCast(df32, b0), sum0); +} + +#define HWY_RVV_WIDEN_MACC(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \ + SHIFT, MLEN, NAME, OP) \ + template \ + HWY_API HWY_RVV_V(BASE, SEWD, LMULD) NAME( \ + HWY_RVV_D(BASE, SEWD, N, SHIFT + 1) d, HWY_RVV_V(BASE, SEWD, LMULD) sum, \ + HWY_RVV_V(BASE, SEW, LMUL) a, HWY_RVV_V(BASE, SEW, LMUL) b) { \ + return OP##CHAR##SEWD##LMULD(sum, a, b, Lanes(d)); \ + } + +HWY_RVV_FOREACH_I16(HWY_RVV_WIDEN_MACC, WidenMulAcc, vwmacc_vv_, _EXT_VIRT) +#undef HWY_RVV_WIDEN_MACC + +// If LMUL is not the max, we can WidenMul first (3 instructions). +template * = nullptr, + class D32 = Simd, class V32 = VFromD, + class D16 = RepartitionToNarrow> +HWY_API VFromD ReorderWidenMulAccumulateI16(Simd d32, + VFromD a, VFromD b, + const V32 sum0, V32& sum1) { + const Twice d32t; + using V32T = VFromD; + V32T sum = Combine(d32t, sum0, sum1); + sum = detail::WidenMulAcc(d32t, sum, a, b); + sum1 = UpperHalf(d32, sum); + return LowerHalf(d32, sum); +} + +// Max LMUL: must LowerHalf first (4 instructions). 
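+// At the maximum LMUL there is no wider register group to Combine into, so
+// instead split a and b into halves and issue one widening multiply-accumulate
+// per half (one into sum1, one into sum0).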
+template , class V32 = VFromD, + class D16 = RepartitionToNarrow> +HWY_API VFromD ReorderWidenMulAccumulateI16(Simd d32, + VFromD a, VFromD b, + const V32 sum0, V32& sum1) { + const Half d16h; + using V16H = VFromD; + const V16H a0 = LowerHalf(d16h, a); + const V16H a1 = UpperHalf(d16h, a); + const V16H b0 = LowerHalf(d16h, b); + const V16H b1 = UpperHalf(d16h, b); + sum1 = detail::WidenMulAcc(d32, sum1, a1, b1); + return detail::WidenMulAcc(d32, sum0, a0, b0); +} + +} // namespace detail + +template +HWY_API VW ReorderWidenMulAccumulate(Simd d32, VN a, VN b, + const VW sum0, VW& sum1) { + return detail::ReorderWidenMulAccumulateBF16(d32, a, b, sum0, sum1); +} + +template +HWY_API VW ReorderWidenMulAccumulate(Simd d32, VN a, VN b, + const VW sum0, VW& sum1) { + return detail::ReorderWidenMulAccumulateI16(d32, a, b, sum0, sum1); +} + +// ------------------------------ Lt128 +template +HWY_INLINE MFromD Lt128(D d, const VFromD a, const VFromD b) { + static_assert(!IsSigned>() && sizeof(TFromD) == 8, + "D must be u64"); + // Truth table of Eq and Compare for Hi and Lo u64. + // (removed lines with (=H && cH) or (=L && cL) - cannot both be true) + // =H =L cH cL | out = cH | (=H & cL) + // 0 0 0 0 | 0 + // 0 0 0 1 | 0 + // 0 0 1 0 | 1 + // 0 0 1 1 | 1 + // 0 1 0 0 | 0 + // 0 1 0 1 | 0 + // 0 1 1 0 | 1 + // 1 0 0 0 | 0 + // 1 0 0 1 | 1 + // 1 1 0 0 | 0 + const VFromD eqHL = VecFromMask(d, Eq(a, b)); + const VFromD ltHL = VecFromMask(d, Lt(a, b)); + // Shift leftward so L can influence H. + const VFromD ltLx = detail::Slide1Up(ltHL); + const VFromD vecHx = OrAnd(ltHL, eqHL, ltLx); + // Replicate H to its neighbor. + return MaskFromVec(OddEven(vecHx, detail::Slide1Down(vecHx))); +} + +// ------------------------------ Lt128Upper +template +HWY_INLINE MFromD Lt128Upper(D d, const VFromD a, const VFromD b) { + static_assert(!IsSigned>() && sizeof(TFromD) == 8, + "D must be u64"); + const VFromD ltHL = VecFromMask(d, Lt(a, b)); + // Replicate H to its neighbor. + return MaskFromVec(OddEven(ltHL, detail::Slide1Down(ltHL))); +} + +// ------------------------------ Eq128 +template +HWY_INLINE MFromD Eq128(D d, const VFromD a, const VFromD b) { + static_assert(!IsSigned>() && sizeof(TFromD) == 8, + "D must be u64"); + const VFromD eqHL = VecFromMask(d, Eq(a, b)); + const VFromD eqLH = Reverse2(d, eqHL); + return MaskFromVec(And(eqHL, eqLH)); +} + +// ------------------------------ Eq128Upper +template +HWY_INLINE MFromD Eq128Upper(D d, const VFromD a, const VFromD b) { + static_assert(!IsSigned>() && sizeof(TFromD) == 8, + "D must be u64"); + const VFromD eqHL = VecFromMask(d, Eq(a, b)); + // Replicate H to its neighbor. + return MaskFromVec(OddEven(eqHL, detail::Slide1Down(eqHL))); +} + +// ------------------------------ Ne128 +template +HWY_INLINE MFromD Ne128(D d, const VFromD a, const VFromD b) { + static_assert(!IsSigned>() && sizeof(TFromD) == 8, + "D must be u64"); + const VFromD neHL = VecFromMask(d, Ne(a, b)); + const VFromD neLH = Reverse2(d, neHL); + return MaskFromVec(Or(neHL, neLH)); +} + +// ------------------------------ Ne128Upper +template +HWY_INLINE MFromD Ne128Upper(D d, const VFromD a, const VFromD b) { + static_assert(!IsSigned>() && sizeof(TFromD) == 8, + "D must be u64"); + const VFromD neHL = VecFromMask(d, Ne(a, b)); + // Replicate H to its neighbor. 
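+  // OddEven(v, Slide1Down(v)) copies each odd (upper) lane into the adjacent
+  // even lane, so both lanes of a 128-bit pair report the upper comparison.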
+  return MaskFromVec(OddEven(neHL, detail::Slide1Down(neHL)));
+}
+
+// ------------------------------ Min128, Max128 (Lt128)
+
+template <class D>
+HWY_INLINE VFromD<D> Min128(D /* tag */, const VFromD<D> a, const VFromD<D> b) {
+  const VFromD<D> aXH = detail::Slide1Down(a);
+  const VFromD<D> bXH = detail::Slide1Down(b);
+  const VFromD<D> minHL = Min(a, b);
+  const MFromD<D> ltXH = Lt(aXH, bXH);
+  const MFromD<D> eqXH = Eq(aXH, bXH);
+  // If the upper lane is the decider, take lo from the same reg.
+  const VFromD<D> lo = IfThenElse(ltXH, a, b);
+  // The upper lane is just minHL; if they are equal, we also need to use the
+  // actual min of the lower lanes.
+  return OddEven(minHL, IfThenElse(eqXH, minHL, lo));
+}
+
+template <class D>
+HWY_INLINE VFromD<D> Max128(D /* tag */, const VFromD<D> a, const VFromD<D> b) {
+  const VFromD<D> aXH = detail::Slide1Down(a);
+  const VFromD<D> bXH = detail::Slide1Down(b);
+  const VFromD<D> maxHL = Max(a, b);
+  const MFromD<D> ltXH = Lt(aXH, bXH);
+  const MFromD<D> eqXH = Eq(aXH, bXH);
+  // If the upper lane is the decider, take lo from the same reg.
+  const VFromD<D> lo = IfThenElse(ltXH, b, a);
+  // The upper lane is just maxHL; if they are equal, we also need to use the
+  // actual max of the lower lanes.
+  return OddEven(maxHL, IfThenElse(eqXH, maxHL, lo));
+}
+
+template <class D>
+HWY_INLINE VFromD<D> Min128Upper(D d, VFromD<D> a, VFromD<D> b) {
+  return IfThenElse(Lt128Upper(d, a, b), a, b);
+}
+
+template <class D>
+HWY_INLINE VFromD<D> Max128Upper(D d, VFromD<D> a, VFromD<D> b) {
+  return IfThenElse(Lt128Upper(d, b, a), a, b);
+}
+
+// ================================================== END MACROS
+namespace detail { // for code folding
+#undef HWY_RVV_AVL
+#undef HWY_RVV_D
+#undef HWY_RVV_FOREACH
+#undef HWY_RVV_FOREACH_08_ALL
+#undef HWY_RVV_FOREACH_08_ALL_VIRT
+#undef HWY_RVV_FOREACH_08_DEMOTE
+#undef HWY_RVV_FOREACH_08_DEMOTE_VIRT
+#undef HWY_RVV_FOREACH_08_EXT
+#undef HWY_RVV_FOREACH_08_EXT_VIRT
+#undef HWY_RVV_FOREACH_08_TRUNC
+#undef HWY_RVV_FOREACH_08_VIRT
+#undef HWY_RVV_FOREACH_16_ALL
+#undef HWY_RVV_FOREACH_16_ALL_VIRT
+#undef HWY_RVV_FOREACH_16_DEMOTE
+#undef HWY_RVV_FOREACH_16_DEMOTE_VIRT
+#undef HWY_RVV_FOREACH_16_EXT
+#undef HWY_RVV_FOREACH_16_EXT_VIRT
+#undef HWY_RVV_FOREACH_16_TRUNC
+#undef HWY_RVV_FOREACH_16_VIRT
+#undef HWY_RVV_FOREACH_32_ALL
+#undef HWY_RVV_FOREACH_32_ALL_VIRT
+#undef HWY_RVV_FOREACH_32_DEMOTE
+#undef HWY_RVV_FOREACH_32_DEMOTE_VIRT
+#undef HWY_RVV_FOREACH_32_EXT
+#undef HWY_RVV_FOREACH_32_EXT_VIRT
+#undef HWY_RVV_FOREACH_32_TRUNC
+#undef HWY_RVV_FOREACH_32_VIRT
+#undef HWY_RVV_FOREACH_64_ALL
+#undef HWY_RVV_FOREACH_64_ALL_VIRT
+#undef HWY_RVV_FOREACH_64_DEMOTE
+#undef HWY_RVV_FOREACH_64_DEMOTE_VIRT
+#undef HWY_RVV_FOREACH_64_EXT
+#undef HWY_RVV_FOREACH_64_EXT_VIRT
+#undef HWY_RVV_FOREACH_64_TRUNC
+#undef HWY_RVV_FOREACH_64_VIRT
+#undef HWY_RVV_FOREACH_B
+#undef HWY_RVV_FOREACH_F
+#undef HWY_RVV_FOREACH_F16
+#undef HWY_RVV_FOREACH_F32
+#undef HWY_RVV_FOREACH_F3264
+#undef HWY_RVV_FOREACH_F64
+#undef HWY_RVV_FOREACH_I
+#undef HWY_RVV_FOREACH_I08
+#undef HWY_RVV_FOREACH_I16
+#undef HWY_RVV_FOREACH_I163264
+#undef HWY_RVV_FOREACH_I32
+#undef HWY_RVV_FOREACH_I64
+#undef HWY_RVV_FOREACH_U
+#undef HWY_RVV_FOREACH_U08
+#undef HWY_RVV_FOREACH_U16
+#undef HWY_RVV_FOREACH_U163264
+#undef HWY_RVV_FOREACH_U32
+#undef HWY_RVV_FOREACH_U64
+#undef HWY_RVV_FOREACH_UI
+#undef HWY_RVV_FOREACH_UI08
+#undef HWY_RVV_FOREACH_UI16
+#undef HWY_RVV_FOREACH_UI163264
+#undef HWY_RVV_FOREACH_UI32
+#undef HWY_RVV_FOREACH_UI3264
+#undef HWY_RVV_FOREACH_UI64
+#undef HWY_RVV_M
+#undef HWY_RVV_RETM_ARGM
+#undef HWY_RVV_RETV_ARGV
+#undef
HWY_RVV_RETV_ARGVS +#undef HWY_RVV_RETV_ARGVV +#undef HWY_RVV_T +#undef HWY_RVV_V +} // namespace detail +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); diff --git a/hwy/ops/scalar-inl.h b/hwy/ops/scalar-inl.h new file mode 100644 index 0000000..8b11828 --- /dev/null +++ b/hwy/ops/scalar-inl.h @@ -0,0 +1,1571 @@ +// Copyright 2019 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Single-element vectors and operations. +// External include guard in highway.h - see comment there. + +#include +#include + +#include "hwy/base.h" +#include "hwy/ops/shared-inl.h" + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { + +// Single instruction, single data. +template +using Sisd = Simd; + +// (Wrapper class required for overloading comparison operators.) +template +struct Vec1 { + HWY_INLINE Vec1() = default; + Vec1(const Vec1&) = default; + Vec1& operator=(const Vec1&) = default; + HWY_INLINE explicit Vec1(const T t) : raw(t) {} + + HWY_INLINE Vec1& operator*=(const Vec1 other) { + return *this = (*this * other); + } + HWY_INLINE Vec1& operator/=(const Vec1 other) { + return *this = (*this / other); + } + HWY_INLINE Vec1& operator+=(const Vec1 other) { + return *this = (*this + other); + } + HWY_INLINE Vec1& operator-=(const Vec1 other) { + return *this = (*this - other); + } + HWY_INLINE Vec1& operator&=(const Vec1 other) { + return *this = (*this & other); + } + HWY_INLINE Vec1& operator|=(const Vec1 other) { + return *this = (*this | other); + } + HWY_INLINE Vec1& operator^=(const Vec1 other) { + return *this = (*this ^ other); + } + + T raw; +}; + +// 0 or FF..FF, same size as Vec1. +template +class Mask1 { + using Raw = hwy::MakeUnsigned; + + public: + static HWY_INLINE Mask1 FromBool(bool b) { + Mask1 mask; + mask.bits = b ? 
static_cast(~Raw{0}) : 0; + return mask; + } + + Raw bits; +}; + +namespace detail { + +// Deduce Sisd from Vec1 +struct Deduce1 { + template + Sisd operator()(Vec1) const { + return Sisd(); + } +}; + +} // namespace detail + +template +using DFromV = decltype(detail::Deduce1()(V())); + +template +using TFromV = TFromD>; + +// ------------------------------ BitCast + +template +HWY_API Vec1 BitCast(Sisd /* tag */, Vec1 v) { + static_assert(sizeof(T) <= sizeof(FromT), "Promoting is undefined"); + T to; + CopyBytes(&v.raw, &to); // not same size - ok to shrink + return Vec1(to); +} + +// ------------------------------ Set + +template +HWY_API Vec1 Zero(Sisd /* tag */) { + return Vec1(T(0)); +} + +template +HWY_API Vec1 Set(Sisd /* tag */, const T2 t) { + return Vec1(static_cast(t)); +} + +template +HWY_API Vec1 Undefined(Sisd d) { + return Zero(d); +} + +template +HWY_API Vec1 Iota(const Sisd /* tag */, const T2 first) { + return Vec1(static_cast(first)); +} + +template +using VFromD = decltype(Zero(D())); + +// ================================================== LOGICAL + +// ------------------------------ Not + +template +HWY_API Vec1 Not(const Vec1 v) { + using TU = MakeUnsigned; + const Sisd du; + return BitCast(Sisd(), Vec1(static_cast(~BitCast(du, v).raw))); +} + +// ------------------------------ And + +template +HWY_API Vec1 And(const Vec1 a, const Vec1 b) { + using TU = MakeUnsigned; + const Sisd du; + return BitCast(Sisd(), Vec1(BitCast(du, a).raw & BitCast(du, b).raw)); +} +template +HWY_API Vec1 operator&(const Vec1 a, const Vec1 b) { + return And(a, b); +} + +// ------------------------------ AndNot + +template +HWY_API Vec1 AndNot(const Vec1 a, const Vec1 b) { + using TU = MakeUnsigned; + const Sisd du; + return BitCast(Sisd(), Vec1(static_cast(~BitCast(du, a).raw & + BitCast(du, b).raw))); +} + +// ------------------------------ Or + +template +HWY_API Vec1 Or(const Vec1 a, const Vec1 b) { + using TU = MakeUnsigned; + const Sisd du; + return BitCast(Sisd(), Vec1(BitCast(du, a).raw | BitCast(du, b).raw)); +} +template +HWY_API Vec1 operator|(const Vec1 a, const Vec1 b) { + return Or(a, b); +} + +// ------------------------------ Xor + +template +HWY_API Vec1 Xor(const Vec1 a, const Vec1 b) { + using TU = MakeUnsigned; + const Sisd du; + return BitCast(Sisd(), Vec1(BitCast(du, a).raw ^ BitCast(du, b).raw)); +} +template +HWY_API Vec1 operator^(const Vec1 a, const Vec1 b) { + return Xor(a, b); +} + +// ------------------------------ Or3 + +template +HWY_API Vec1 Or3(Vec1 o1, Vec1 o2, Vec1 o3) { + return Or(o1, Or(o2, o3)); +} + +// ------------------------------ OrAnd + +template +HWY_API Vec1 OrAnd(const Vec1 o, const Vec1 a1, const Vec1 a2) { + return Or(o, And(a1, a2)); +} + +// ------------------------------ IfVecThenElse + +template +HWY_API Vec1 IfVecThenElse(Vec1 mask, Vec1 yes, Vec1 no) { + return IfThenElse(MaskFromVec(mask), yes, no); +} + +// ------------------------------ CopySign + +template +HWY_API Vec1 CopySign(const Vec1 magn, const Vec1 sign) { + static_assert(IsFloat(), "Only makes sense for floating-point"); + const auto msb = SignBit(Sisd()); + return Or(AndNot(msb, magn), And(msb, sign)); +} + +template +HWY_API Vec1 CopySignToAbs(const Vec1 abs, const Vec1 sign) { + static_assert(IsFloat(), "Only makes sense for floating-point"); + return Or(abs, And(SignBit(Sisd()), sign)); +} + +// ------------------------------ BroadcastSignBit + +template +HWY_API Vec1 BroadcastSignBit(const Vec1 v) { + // This is used inside ShiftRight, so we cannot implement in terms 
of it. + return v.raw < 0 ? Vec1(T(-1)) : Vec1(0); +} + +// ------------------------------ PopulationCount + +#ifdef HWY_NATIVE_POPCNT +#undef HWY_NATIVE_POPCNT +#else +#define HWY_NATIVE_POPCNT +#endif + +template +HWY_API Vec1 PopulationCount(Vec1 v) { + return Vec1(static_cast(PopCount(v.raw))); +} + +// ------------------------------ Mask + +template +HWY_API Mask1 RebindMask(Sisd /*tag*/, Mask1 m) { + static_assert(sizeof(TFrom) == sizeof(TTo), "Must have same size"); + return Mask1{m.bits}; +} + +// v must be 0 or FF..FF. +template +HWY_API Mask1 MaskFromVec(const Vec1 v) { + Mask1 mask; + CopySameSize(&v, &mask); + return mask; +} + +template +Vec1 VecFromMask(const Mask1 mask) { + Vec1 v; + CopySameSize(&mask, &v); + return v; +} + +template +Vec1 VecFromMask(Sisd /* tag */, const Mask1 mask) { + Vec1 v; + CopySameSize(&mask, &v); + return v; +} + +template +HWY_API Mask1 FirstN(Sisd /*tag*/, size_t n) { + return Mask1::FromBool(n != 0); +} + +// Returns mask ? yes : no. +template +HWY_API Vec1 IfThenElse(const Mask1 mask, const Vec1 yes, + const Vec1 no) { + return mask.bits ? yes : no; +} + +template +HWY_API Vec1 IfThenElseZero(const Mask1 mask, const Vec1 yes) { + return mask.bits ? yes : Vec1(0); +} + +template +HWY_API Vec1 IfThenZeroElse(const Mask1 mask, const Vec1 no) { + return mask.bits ? Vec1(0) : no; +} + +template +HWY_API Vec1 IfNegativeThenElse(Vec1 v, Vec1 yes, Vec1 no) { + return v.raw < 0 ? yes : no; +} + +template +HWY_API Vec1 ZeroIfNegative(const Vec1 v) { + return v.raw < 0 ? Vec1(0) : v; +} + +// ------------------------------ Mask logical + +template +HWY_API Mask1 Not(const Mask1 m) { + return MaskFromVec(Not(VecFromMask(Sisd(), m))); +} + +template +HWY_API Mask1 And(const Mask1 a, Mask1 b) { + const Sisd d; + return MaskFromVec(And(VecFromMask(d, a), VecFromMask(d, b))); +} + +template +HWY_API Mask1 AndNot(const Mask1 a, Mask1 b) { + const Sisd d; + return MaskFromVec(AndNot(VecFromMask(d, a), VecFromMask(d, b))); +} + +template +HWY_API Mask1 Or(const Mask1 a, Mask1 b) { + const Sisd d; + return MaskFromVec(Or(VecFromMask(d, a), VecFromMask(d, b))); +} + +template +HWY_API Mask1 Xor(const Mask1 a, Mask1 b) { + const Sisd d; + return MaskFromVec(Xor(VecFromMask(d, a), VecFromMask(d, b))); +} + +template +HWY_API Mask1 ExclusiveNeither(const Mask1 a, Mask1 b) { + const Sisd d; + return MaskFromVec(AndNot(VecFromMask(d, a), Not(VecFromMask(d, b)))); +} + +// ================================================== SHIFTS + +// ------------------------------ ShiftLeft/ShiftRight (BroadcastSignBit) + +template +HWY_API Vec1 ShiftLeft(const Vec1 v) { + static_assert(0 <= kBits && kBits < sizeof(T) * 8, "Invalid shift"); + return Vec1( + static_cast(static_cast>(v.raw) << kBits)); +} + +template +HWY_API Vec1 ShiftRight(const Vec1 v) { + static_assert(0 <= kBits && kBits < sizeof(T) * 8, "Invalid shift"); +#if __cplusplus >= 202002L + // Signed right shift is now guaranteed to be arithmetic (rounding toward + // negative infinity, i.e. shifting in the sign bit). + return Vec1(static_cast(v.raw >> kBits)); +#else + if (IsSigned()) { + // Emulate arithmetic shift using only logical (unsigned) shifts, because + // signed shifts are still implementation-defined. 
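+    // Worked example for T=int8_t, v=-8 (0xF8), kBits=2: the logical shift
+    // yields 0x3E, the broadcast sign is 0xFF, 0xFF << (8-1-2) = 0xE0, and
+    // 0x3E | 0xE0 = 0xFE = -2, matching an arithmetic shift of -8 by 2.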
+ using TU = hwy::MakeUnsigned; + const Sisd du; + const TU shifted = BitCast(du, v).raw >> kBits; + const TU sign = BitCast(du, BroadcastSignBit(v)).raw; + const size_t sign_shift = + static_cast(static_cast(sizeof(TU)) * 8 - 1 - kBits); + const TU upper = static_cast(sign << sign_shift); + return BitCast(Sisd(), Vec1(shifted | upper)); + } else { // T is unsigned + return Vec1(static_cast(v.raw >> kBits)); + } +#endif +} + +// ------------------------------ RotateRight (ShiftRight) + +namespace detail { + +// For partial specialization: kBits == 0 results in an invalid shift count +template +struct RotateRight { + template + HWY_INLINE Vec1 operator()(const Vec1 v) const { + return Or(ShiftRight(v), ShiftLeft(v)); + } +}; + +template <> +struct RotateRight<0> { + template + HWY_INLINE Vec1 operator()(const Vec1 v) const { + return v; + } +}; + +} // namespace detail + +template +HWY_API Vec1 RotateRight(const Vec1 v) { + static_assert(0 <= kBits && kBits < sizeof(T) * 8, "Invalid shift"); + return detail::RotateRight()(v); +} + +// ------------------------------ ShiftLeftSame (BroadcastSignBit) + +template +HWY_API Vec1 ShiftLeftSame(const Vec1 v, int bits) { + return Vec1( + static_cast(static_cast>(v.raw) << bits)); +} + +template +HWY_API Vec1 ShiftRightSame(const Vec1 v, int bits) { +#if __cplusplus >= 202002L + // Signed right shift is now guaranteed to be arithmetic (rounding toward + // negative infinity, i.e. shifting in the sign bit). + return Vec1(static_cast(v.raw >> bits)); +#else + if (IsSigned()) { + // Emulate arithmetic shift using only logical (unsigned) shifts, because + // signed shifts are still implementation-defined. + using TU = hwy::MakeUnsigned; + const Sisd du; + const TU shifted = BitCast(du, v).raw >> bits; + const TU sign = BitCast(du, BroadcastSignBit(v)).raw; + const size_t sign_shift = + static_cast(static_cast(sizeof(TU)) * 8 - 1 - bits); + const TU upper = static_cast(sign << sign_shift); + return BitCast(Sisd(), Vec1(shifted | upper)); + } else { // T is unsigned + return Vec1(static_cast(v.raw >> bits)); + } +#endif +} + +// ------------------------------ Shl + +// Single-lane => same as ShiftLeftSame except for the argument type. +template +HWY_API Vec1 operator<<(const Vec1 v, const Vec1 bits) { + return ShiftLeftSame(v, static_cast(bits.raw)); +} + +template +HWY_API Vec1 operator>>(const Vec1 v, const Vec1 bits) { + return ShiftRightSame(v, static_cast(bits.raw)); +} + +// ================================================== ARITHMETIC + +template +HWY_API Vec1 operator+(Vec1 a, Vec1 b) { + const uint64_t a64 = static_cast(a.raw); + const uint64_t b64 = static_cast(b.raw); + return Vec1(static_cast((a64 + b64) & static_cast(~T(0)))); +} +HWY_API Vec1 operator+(const Vec1 a, const Vec1 b) { + return Vec1(a.raw + b.raw); +} +HWY_API Vec1 operator+(const Vec1 a, const Vec1 b) { + return Vec1(a.raw + b.raw); +} + +template +HWY_API Vec1 operator-(Vec1 a, Vec1 b) { + const uint64_t a64 = static_cast(a.raw); + const uint64_t b64 = static_cast(b.raw); + return Vec1(static_cast((a64 - b64) & static_cast(~T(0)))); +} +HWY_API Vec1 operator-(const Vec1 a, const Vec1 b) { + return Vec1(a.raw - b.raw); +} +HWY_API Vec1 operator-(const Vec1 a, const Vec1 b) { + return Vec1(a.raw - b.raw); +} + +// ------------------------------ SumsOf8 + +HWY_API Vec1 SumsOf8(const Vec1 v) { + return Vec1(v.raw); +} + +// ------------------------------ SaturatedAdd + +// Returns a + b clamped to the destination range. 
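+// For example: SaturatedAdd of uint8_t 200 and 100 yields 255 rather than
+// wrapping to 44, and SaturatedAdd of int8_t -100 and -50 yields -128.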
+ +// Unsigned +HWY_API Vec1 SaturatedAdd(const Vec1 a, + const Vec1 b) { + return Vec1( + static_cast(HWY_MIN(HWY_MAX(0, a.raw + b.raw), 255))); +} +HWY_API Vec1 SaturatedAdd(const Vec1 a, + const Vec1 b) { + return Vec1( + static_cast(HWY_MIN(HWY_MAX(0, a.raw + b.raw), 65535))); +} + +// Signed +HWY_API Vec1 SaturatedAdd(const Vec1 a, const Vec1 b) { + return Vec1( + static_cast(HWY_MIN(HWY_MAX(-128, a.raw + b.raw), 127))); +} +HWY_API Vec1 SaturatedAdd(const Vec1 a, + const Vec1 b) { + return Vec1( + static_cast(HWY_MIN(HWY_MAX(-32768, a.raw + b.raw), 32767))); +} + +// ------------------------------ Saturating subtraction + +// Returns a - b clamped to the destination range. + +// Unsigned +HWY_API Vec1 SaturatedSub(const Vec1 a, + const Vec1 b) { + return Vec1( + static_cast(HWY_MIN(HWY_MAX(0, a.raw - b.raw), 255))); +} +HWY_API Vec1 SaturatedSub(const Vec1 a, + const Vec1 b) { + return Vec1( + static_cast(HWY_MIN(HWY_MAX(0, a.raw - b.raw), 65535))); +} + +// Signed +HWY_API Vec1 SaturatedSub(const Vec1 a, const Vec1 b) { + return Vec1( + static_cast(HWY_MIN(HWY_MAX(-128, a.raw - b.raw), 127))); +} +HWY_API Vec1 SaturatedSub(const Vec1 a, + const Vec1 b) { + return Vec1( + static_cast(HWY_MIN(HWY_MAX(-32768, a.raw - b.raw), 32767))); +} + +// ------------------------------ Average + +// Returns (a + b + 1) / 2 + +HWY_API Vec1 AverageRound(const Vec1 a, + const Vec1 b) { + return Vec1(static_cast((a.raw + b.raw + 1) / 2)); +} +HWY_API Vec1 AverageRound(const Vec1 a, + const Vec1 b) { + return Vec1(static_cast((a.raw + b.raw + 1) / 2)); +} + +// ------------------------------ Absolute value + +template +HWY_API Vec1 Abs(const Vec1 a) { + const T i = a.raw; + return (i >= 0 || i == hwy::LimitsMin()) ? a : Vec1(-i); +} +HWY_API Vec1 Abs(const Vec1 a) { + return Vec1(std::abs(a.raw)); +} +HWY_API Vec1 Abs(const Vec1 a) { + return Vec1(std::abs(a.raw)); +} + +// ------------------------------ min/max + +template +HWY_API Vec1 Min(const Vec1 a, const Vec1 b) { + return Vec1(HWY_MIN(a.raw, b.raw)); +} + +template +HWY_API Vec1 Min(const Vec1 a, const Vec1 b) { + if (std::isnan(a.raw)) return b; + if (std::isnan(b.raw)) return a; + return Vec1(HWY_MIN(a.raw, b.raw)); +} + +template +HWY_API Vec1 Max(const Vec1 a, const Vec1 b) { + return Vec1(HWY_MAX(a.raw, b.raw)); +} + +template +HWY_API Vec1 Max(const Vec1 a, const Vec1 b) { + if (std::isnan(a.raw)) return b; + if (std::isnan(b.raw)) return a; + return Vec1(HWY_MAX(a.raw, b.raw)); +} + +// ------------------------------ Floating-point negate + +template +HWY_API Vec1 Neg(const Vec1 v) { + return Xor(v, SignBit(Sisd())); +} + +template +HWY_API Vec1 Neg(const Vec1 v) { + return Zero(Sisd()) - v; +} + +// ------------------------------ mul/div + +template +HWY_API Vec1 operator*(const Vec1 a, const Vec1 b) { + return Vec1(static_cast(double(a.raw) * b.raw)); +} + +template +HWY_API Vec1 operator*(const Vec1 a, const Vec1 b) { + return Vec1(static_cast(int64_t(a.raw) * b.raw)); +} + +template +HWY_API Vec1 operator*(const Vec1 a, const Vec1 b) { + return Vec1(static_cast(uint64_t(a.raw) * b.raw)); +} + +template +HWY_API Vec1 operator/(const Vec1 a, const Vec1 b) { + return Vec1(a.raw / b.raw); +} + +// Returns the upper 16 bits of a * b in each lane. +HWY_API Vec1 MulHigh(const Vec1 a, const Vec1 b) { + return Vec1(static_cast((a.raw * b.raw) >> 16)); +} +HWY_API Vec1 MulHigh(const Vec1 a, const Vec1 b) { + // Cast to uint32_t first to prevent overflow. Otherwise the result of + // uint16_t * uint16_t is in "int" which may overflow. 
In practice the result + // is the same but this way it is also defined. + return Vec1(static_cast( + (static_cast(a.raw) * static_cast(b.raw)) >> 16)); +} + +HWY_API Vec1 MulFixedPoint15(Vec1 a, Vec1 b) { + return Vec1(static_cast((2 * a.raw * b.raw + 32768) >> 16)); +} + +// Multiplies even lanes (0, 2 ..) and returns the double-wide result. +HWY_API Vec1 MulEven(const Vec1 a, const Vec1 b) { + const int64_t a64 = a.raw; + return Vec1(a64 * b.raw); +} +HWY_API Vec1 MulEven(const Vec1 a, const Vec1 b) { + const uint64_t a64 = a.raw; + return Vec1(a64 * b.raw); +} + +// Approximate reciprocal +HWY_API Vec1 ApproximateReciprocal(const Vec1 v) { + // Zero inputs are allowed, but callers are responsible for replacing the + // return value with something else (typically using IfThenElse). This check + // avoids a ubsan error. The return value is arbitrary. + if (v.raw == 0.0f) return Vec1(0.0f); + return Vec1(1.0f / v.raw); +} + +// Absolute value of difference. +HWY_API Vec1 AbsDiff(const Vec1 a, const Vec1 b) { + return Abs(a - b); +} + +// ------------------------------ Floating-point multiply-add variants + +template +HWY_API Vec1 MulAdd(const Vec1 mul, const Vec1 x, const Vec1 add) { + return mul * x + add; +} + +template +HWY_API Vec1 NegMulAdd(const Vec1 mul, const Vec1 x, + const Vec1 add) { + return add - mul * x; +} + +template +HWY_API Vec1 MulSub(const Vec1 mul, const Vec1 x, const Vec1 sub) { + return mul * x - sub; +} + +template +HWY_API Vec1 NegMulSub(const Vec1 mul, const Vec1 x, + const Vec1 sub) { + return Neg(mul) * x - sub; +} + +// ------------------------------ Floating-point square root + +// Approximate reciprocal square root +HWY_API Vec1 ApproximateReciprocalSqrt(const Vec1 v) { + float f = v.raw; + const float half = f * 0.5f; + uint32_t bits; + CopySameSize(&f, &bits); + // Initial guess based on log2(f) + bits = 0x5F3759DF - (bits >> 1); + CopySameSize(&bits, &f); + // One Newton-Raphson iteration + return Vec1(f * (1.5f - (half * f * f))); +} + +// Square root +HWY_API Vec1 Sqrt(const Vec1 v) { + return Vec1(std::sqrt(v.raw)); +} +HWY_API Vec1 Sqrt(const Vec1 v) { + return Vec1(std::sqrt(v.raw)); +} + +// ------------------------------ Floating-point rounding + +template +HWY_API Vec1 Round(const Vec1 v) { + using TI = MakeSigned; + if (!(Abs(v).raw < MantissaEnd())) { // Huge or NaN + return v; + } + const T bias = v.raw < T(0.0) ? T(-0.5) : T(0.5); + const TI rounded = static_cast(v.raw + bias); + if (rounded == 0) return CopySignToAbs(Vec1(0), v); + // Round to even + if ((rounded & 1) && std::abs(static_cast(rounded) - v.raw) == T(0.5)) { + return Vec1(static_cast(rounded - (v.raw < T(0) ? -1 : 1))); + } + return Vec1(static_cast(rounded)); +} + +// Round-to-nearest even. +HWY_API Vec1 NearestInt(const Vec1 v) { + using T = float; + using TI = int32_t; + + const T abs = Abs(v).raw; + const bool signbit = std::signbit(v.raw); + + if (!(abs < MantissaEnd())) { // Huge or NaN + // Check if too large to cast or NaN + if (!(abs <= static_cast(LimitsMax()))) { + return Vec1(signbit ? LimitsMin() : LimitsMax()); + } + return Vec1(static_cast(v.raw)); + } + const T bias = v.raw < T(0.0) ? T(-0.5) : T(0.5); + const TI rounded = static_cast(v.raw + bias); + if (rounded == 0) return Vec1(0); + // Round to even + if ((rounded & 1) && std::abs(static_cast(rounded) - v.raw) == T(0.5)) { + return Vec1(rounded - (signbit ? 
-1 : 1)); + } + return Vec1(rounded); +} + +template +HWY_API Vec1 Trunc(const Vec1 v) { + using TI = MakeSigned; + if (!(Abs(v).raw <= MantissaEnd())) { // Huge or NaN + return v; + } + const TI truncated = static_cast(v.raw); + if (truncated == 0) return CopySignToAbs(Vec1(0), v); + return Vec1(static_cast(truncated)); +} + +template +V Ceiling(const V v) { + const Bits kExponentMask = (1ull << kExponentBits) - 1; + const Bits kMantissaMask = (1ull << kMantissaBits) - 1; + const Bits kBias = kExponentMask / 2; + + Float f = v.raw; + const bool positive = f > Float(0.0); + + Bits bits; + CopySameSize(&v, &bits); + + const int exponent = + static_cast(((bits >> kMantissaBits) & kExponentMask) - kBias); + // Already an integer. + if (exponent >= kMantissaBits) return v; + // |v| <= 1 => 0 or 1. + if (exponent < 0) return positive ? V(1) : V(-0.0); + + const Bits mantissa_mask = kMantissaMask >> exponent; + // Already an integer + if ((bits & mantissa_mask) == 0) return v; + + // Clear fractional bits and round up + if (positive) bits += (kMantissaMask + 1) >> exponent; + bits &= ~mantissa_mask; + + CopySameSize(&bits, &f); + return V(f); +} + +template +V Floor(const V v) { + const Bits kExponentMask = (1ull << kExponentBits) - 1; + const Bits kMantissaMask = (1ull << kMantissaBits) - 1; + const Bits kBias = kExponentMask / 2; + + Float f = v.raw; + const bool negative = f < Float(0.0); + + Bits bits; + CopySameSize(&v, &bits); + + const int exponent = + static_cast(((bits >> kMantissaBits) & kExponentMask) - kBias); + // Already an integer. + if (exponent >= kMantissaBits) return v; + // |v| <= 1 => -1 or 0. + if (exponent < 0) return V(negative ? Float(-1.0) : Float(0.0)); + + const Bits mantissa_mask = kMantissaMask >> exponent; + // Already an integer + if ((bits & mantissa_mask) == 0) return v; + + // Clear fractional bits and round down + if (negative) bits += (kMantissaMask + 1) >> exponent; + bits &= ~mantissa_mask; + + CopySameSize(&bits, &f); + return V(f); +} + +// Toward +infinity, aka ceiling +HWY_API Vec1 Ceil(const Vec1 v) { + return Ceiling(v); +} +HWY_API Vec1 Ceil(const Vec1 v) { + return Ceiling(v); +} + +// Toward -infinity, aka floor +HWY_API Vec1 Floor(const Vec1 v) { + return Floor(v); +} +HWY_API Vec1 Floor(const Vec1 v) { + return Floor(v); +} + +// ================================================== COMPARE + +template +HWY_API Mask1 operator==(const Vec1 a, const Vec1 b) { + return Mask1::FromBool(a.raw == b.raw); +} + +template +HWY_API Mask1 operator!=(const Vec1 a, const Vec1 b) { + return Mask1::FromBool(a.raw != b.raw); +} + +template +HWY_API Mask1 TestBit(const Vec1 v, const Vec1 bit) { + static_assert(!hwy::IsFloat(), "Only integer vectors supported"); + return (v & bit) == bit; +} + +template +HWY_API Mask1 operator<(const Vec1 a, const Vec1 b) { + return Mask1::FromBool(a.raw < b.raw); +} +template +HWY_API Mask1 operator>(const Vec1 a, const Vec1 b) { + return Mask1::FromBool(a.raw > b.raw); +} + +template +HWY_API Mask1 operator<=(const Vec1 a, const Vec1 b) { + return Mask1::FromBool(a.raw <= b.raw); +} +template +HWY_API Mask1 operator>=(const Vec1 a, const Vec1 b) { + return Mask1::FromBool(a.raw >= b.raw); +} + +// ------------------------------ Floating-point classification (==) + +template +HWY_API Mask1 IsNaN(const Vec1 v) { + // std::isnan returns false for 0x7F..FF in clang AVX3 builds, so DIY. 
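+  // Adding the bits to themselves discards the sign bit; after shifting back,
+  // NaN is exactly bits > ExponentMask() (0x7F800000 for float: exponent
+  // all-ones with a nonzero mantissa).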
+ MakeUnsigned bits; + CopySameSize(&v, &bits); + bits += bits; + bits >>= 1; // clear sign bit + // NaN if all exponent bits are set and the mantissa is not zero. + return Mask1::FromBool(bits > ExponentMask()); +} + +HWY_API Mask1 IsInf(const Vec1 v) { + const Sisd d; + const RebindToUnsigned du; + const Vec1 vu = BitCast(du, v); + // 'Shift left' to clear the sign bit, check for exponent=max and mantissa=0. + return RebindMask(d, (vu + vu) == Set(du, 0xFF000000u)); +} +HWY_API Mask1 IsInf(const Vec1 v) { + const Sisd d; + const RebindToUnsigned du; + const Vec1 vu = BitCast(du, v); + // 'Shift left' to clear the sign bit, check for exponent=max and mantissa=0. + return RebindMask(d, (vu + vu) == Set(du, 0xFFE0000000000000ull)); +} + +HWY_API Mask1 IsFinite(const Vec1 v) { + const Vec1 vu = BitCast(Sisd(), v); + // Shift left to clear the sign bit, check whether exponent != max value. + return Mask1::FromBool((vu.raw << 1) < 0xFF000000u); +} +HWY_API Mask1 IsFinite(const Vec1 v) { + const Vec1 vu = BitCast(Sisd(), v); + // Shift left to clear the sign bit, check whether exponent != max value. + return Mask1::FromBool((vu.raw << 1) < 0xFFE0000000000000ull); +} + +// ================================================== MEMORY + +// ------------------------------ Load + +template +HWY_API Vec1 Load(Sisd /* tag */, const T* HWY_RESTRICT aligned) { + T t; + CopySameSize(aligned, &t); + return Vec1(t); +} + +template +HWY_API Vec1 MaskedLoad(Mask1 m, Sisd d, + const T* HWY_RESTRICT aligned) { + return IfThenElseZero(m, Load(d, aligned)); +} + +template +HWY_API Vec1 LoadU(Sisd d, const T* HWY_RESTRICT p) { + return Load(d, p); +} + +// In some use cases, "load single lane" is sufficient; otherwise avoid this. +template +HWY_API Vec1 LoadDup128(Sisd d, const T* HWY_RESTRICT aligned) { + return Load(d, aligned); +} + +// ------------------------------ Store + +template +HWY_API void Store(const Vec1 v, Sisd /* tag */, + T* HWY_RESTRICT aligned) { + CopySameSize(&v.raw, aligned); +} + +template +HWY_API void StoreU(const Vec1 v, Sisd d, T* HWY_RESTRICT p) { + return Store(v, d, p); +} + +template +HWY_API void BlendedStore(const Vec1 v, Mask1 m, Sisd d, + T* HWY_RESTRICT p) { + if (!m.bits) return; + StoreU(v, d, p); +} + +// ------------------------------ LoadInterleaved2/3/4 + +// Per-target flag to prevent generic_ops-inl.h from defining StoreInterleaved2. 
+#ifdef HWY_NATIVE_LOAD_STORE_INTERLEAVED +#undef HWY_NATIVE_LOAD_STORE_INTERLEAVED +#else +#define HWY_NATIVE_LOAD_STORE_INTERLEAVED +#endif + +template +HWY_API void LoadInterleaved2(Sisd d, const T* HWY_RESTRICT unaligned, + Vec1& v0, Vec1& v1) { + v0 = LoadU(d, unaligned + 0); + v1 = LoadU(d, unaligned + 1); +} + +template +HWY_API void LoadInterleaved3(Sisd d, const T* HWY_RESTRICT unaligned, + Vec1& v0, Vec1& v1, Vec1& v2) { + v0 = LoadU(d, unaligned + 0); + v1 = LoadU(d, unaligned + 1); + v2 = LoadU(d, unaligned + 2); +} + +template +HWY_API void LoadInterleaved4(Sisd d, const T* HWY_RESTRICT unaligned, + Vec1& v0, Vec1& v1, Vec1& v2, + Vec1& v3) { + v0 = LoadU(d, unaligned + 0); + v1 = LoadU(d, unaligned + 1); + v2 = LoadU(d, unaligned + 2); + v3 = LoadU(d, unaligned + 3); +} + +// ------------------------------ StoreInterleaved2/3/4 + +template +HWY_API void StoreInterleaved2(const Vec1 v0, const Vec1 v1, Sisd d, + T* HWY_RESTRICT unaligned) { + StoreU(v0, d, unaligned + 0); + StoreU(v1, d, unaligned + 1); +} + +template +HWY_API void StoreInterleaved3(const Vec1 v0, const Vec1 v1, + const Vec1 v2, Sisd d, + T* HWY_RESTRICT unaligned) { + StoreU(v0, d, unaligned + 0); + StoreU(v1, d, unaligned + 1); + StoreU(v2, d, unaligned + 2); +} + +template +HWY_API void StoreInterleaved4(const Vec1 v0, const Vec1 v1, + const Vec1 v2, const Vec1 v3, Sisd d, + T* HWY_RESTRICT unaligned) { + StoreU(v0, d, unaligned + 0); + StoreU(v1, d, unaligned + 1); + StoreU(v2, d, unaligned + 2); + StoreU(v3, d, unaligned + 3); +} + +// ------------------------------ Stream + +template +HWY_API void Stream(const Vec1 v, Sisd d, T* HWY_RESTRICT aligned) { + return Store(v, d, aligned); +} + +// ------------------------------ Scatter + +template +HWY_API void ScatterOffset(Vec1 v, Sisd d, T* base, + const Vec1 offset) { + static_assert(sizeof(T) == sizeof(Offset), "Must match for portability"); + uint8_t* const base8 = reinterpret_cast(base) + offset.raw; + return Store(v, d, reinterpret_cast(base8)); +} + +template +HWY_API void ScatterIndex(Vec1 v, Sisd d, T* HWY_RESTRICT base, + const Vec1 index) { + static_assert(sizeof(T) == sizeof(Index), "Must match for portability"); + return Store(v, d, base + index.raw); +} + +// ------------------------------ Gather + +template +HWY_API Vec1 GatherOffset(Sisd d, const T* base, + const Vec1 offset) { + static_assert(sizeof(T) == sizeof(Offset), "Must match for portability"); + const intptr_t addr = + reinterpret_cast(base) + static_cast(offset.raw); + return Load(d, reinterpret_cast(addr)); +} + +template +HWY_API Vec1 GatherIndex(Sisd d, const T* HWY_RESTRICT base, + const Vec1 index) { + static_assert(sizeof(T) == sizeof(Index), "Must match for portability"); + return Load(d, base + index.raw); +} + +// ================================================== CONVERT + +// ConvertTo and DemoteTo with floating-point input and integer output truncate +// (rounding toward zero). + +template +HWY_API Vec1 PromoteTo(Sisd /* tag */, Vec1 from) { + static_assert(sizeof(ToT) > sizeof(FromT), "Not promoting"); + // For bits Y > X, floatX->floatY and intX->intY are always representable. + return Vec1(static_cast(from.raw)); +} + +// MSVC 19.10 cannot deduce the argument type if HWY_IF_FLOAT(FromT) is here, +// so we overload for FromT=double and ToT={float,int32_t}. 
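+// Out-of-range magnitudes saturate rather than invoke UB: e.g. demoting the
+// double 1e39 to float returns HighestValue<float>() because 1e39 exceeds the
+// largest finite float (about 3.4e38).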
+HWY_API Vec1 DemoteTo(Sisd /* tag */, Vec1 from) { + // Prevent ubsan errors when converting float to narrower integer/float + if (std::isinf(from.raw) || + std::fabs(from.raw) > static_cast(HighestValue())) { + return Vec1(std::signbit(from.raw) ? LowestValue() + : HighestValue()); + } + return Vec1(static_cast(from.raw)); +} +HWY_API Vec1 DemoteTo(Sisd /* tag */, Vec1 from) { + // Prevent ubsan errors when converting int32_t to narrower integer/int32_t + if (std::isinf(from.raw) || + std::fabs(from.raw) > static_cast(HighestValue())) { + return Vec1(std::signbit(from.raw) ? LowestValue() + : HighestValue()); + } + return Vec1(static_cast(from.raw)); +} + +template +HWY_API Vec1 DemoteTo(Sisd /* tag */, Vec1 from) { + static_assert(!IsFloat(), "FromT=double are handled above"); + static_assert(sizeof(ToT) < sizeof(FromT), "Not demoting"); + + // Int to int: choose closest value in ToT to `from` (avoids UB) + from.raw = HWY_MIN(HWY_MAX(LimitsMin(), from.raw), LimitsMax()); + return Vec1(static_cast(from.raw)); +} + +HWY_API Vec1 PromoteTo(Sisd /* tag */, const Vec1 v) { + uint16_t bits16; + CopySameSize(&v.raw, &bits16); + const uint32_t sign = static_cast(bits16 >> 15); + const uint32_t biased_exp = (bits16 >> 10) & 0x1F; + const uint32_t mantissa = bits16 & 0x3FF; + + // Subnormal or zero + if (biased_exp == 0) { + const float subnormal = + (1.0f / 16384) * (static_cast(mantissa) * (1.0f / 1024)); + return Vec1(sign ? -subnormal : subnormal); + } + + // Normalized: convert the representation directly (faster than ldexp/tables). + const uint32_t biased_exp32 = biased_exp + (127 - 15); + const uint32_t mantissa32 = mantissa << (23 - 10); + const uint32_t bits32 = (sign << 31) | (biased_exp32 << 23) | mantissa32; + float out; + CopySameSize(&bits32, &out); + return Vec1(out); +} + +HWY_API Vec1 PromoteTo(Sisd d, const Vec1 v) { + return Set(d, F32FromBF16(v.raw)); +} + +HWY_API Vec1 DemoteTo(Sisd /* tag */, + const Vec1 v) { + uint32_t bits32; + CopySameSize(&v.raw, &bits32); + const uint32_t sign = bits32 >> 31; + const uint32_t biased_exp32 = (bits32 >> 23) & 0xFF; + const uint32_t mantissa32 = bits32 & 0x7FFFFF; + + const int32_t exp = HWY_MIN(static_cast(biased_exp32) - 127, 15); + + // Tiny or zero => zero. + Vec1 out; + if (exp < -24) { + const uint16_t zero = 0; + CopySameSize(&zero, &out.raw); + return out; + } + + uint32_t biased_exp16, mantissa16; + + // exp = [-24, -15] => subnormal + if (exp < -14) { + biased_exp16 = 0; + const uint32_t sub_exp = static_cast(-14 - exp); + HWY_DASSERT(1 <= sub_exp && sub_exp < 11); + mantissa16 = static_cast((1u << (10 - sub_exp)) + + (mantissa32 >> (13 + sub_exp))); + } else { + // exp = [-14, 15] + biased_exp16 = static_cast(exp + 15); + HWY_DASSERT(1 <= biased_exp16 && biased_exp16 < 31); + mantissa16 = mantissa32 >> 13; + } + + HWY_DASSERT(mantissa16 < 1024); + const uint32_t bits16 = (sign << 15) | (biased_exp16 << 10) | mantissa16; + HWY_DASSERT(bits16 < 0x10000); + const uint16_t narrowed = static_cast(bits16); // big-endian safe + CopySameSize(&narrowed, &out.raw); + return out; +} + +HWY_API Vec1 DemoteTo(Sisd d, const Vec1 v) { + return Set(d, BF16FromF32(v.raw)); +} + +template +HWY_API Vec1 ConvertTo(Sisd /* tag */, Vec1 from) { + static_assert(sizeof(ToT) == sizeof(FromT), "Should have same size"); + // float## -> int##: return closest representable value. We cannot exactly + // represent LimitsMax in FromT, so use double. 
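+  // E.g. for float -> int32_t: LimitsMax<int32_t>() = 2147483647 is not
+  // representable as float (it rounds up to 2147483648.0f), whereas double
+  // holds every int32_t exactly, so the range check is done in double.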
+ const double f = static_cast(from.raw); + if (std::isinf(from.raw) || + std::fabs(f) > static_cast(LimitsMax())) { + return Vec1(std::signbit(from.raw) ? LimitsMin() + : LimitsMax()); + } + return Vec1(static_cast(from.raw)); +} + +template +HWY_API Vec1 ConvertTo(Sisd /* tag */, Vec1 from) { + static_assert(sizeof(ToT) == sizeof(FromT), "Should have same size"); + // int## -> float##: no check needed + return Vec1(static_cast(from.raw)); +} + +HWY_API Vec1 U8FromU32(const Vec1 v) { + return DemoteTo(Sisd(), v); +} + +// ------------------------------ Truncations + +HWY_API Vec1 TruncateTo(Sisd /* tag */, + const Vec1 v) { + return Vec1{static_cast(v.raw & 0xFF)}; +} + +HWY_API Vec1 TruncateTo(Sisd /* tag */, + const Vec1 v) { + return Vec1{static_cast(v.raw & 0xFFFF)}; +} + +HWY_API Vec1 TruncateTo(Sisd /* tag */, + const Vec1 v) { + return Vec1{static_cast(v.raw & 0xFFFFFFFFu)}; +} + +HWY_API Vec1 TruncateTo(Sisd /* tag */, + const Vec1 v) { + return Vec1{static_cast(v.raw & 0xFF)}; +} + +HWY_API Vec1 TruncateTo(Sisd /* tag */, + const Vec1 v) { + return Vec1{static_cast(v.raw & 0xFFFF)}; +} + +HWY_API Vec1 TruncateTo(Sisd /* tag */, + const Vec1 v) { + return Vec1{static_cast(v.raw & 0xFF)}; +} + +// ================================================== COMBINE +// UpperHalf, ZeroExtendVector, Combine, Concat* are unsupported. + +template +HWY_API Vec1 LowerHalf(Vec1 v) { + return v; +} + +template +HWY_API Vec1 LowerHalf(Sisd /* tag */, Vec1 v) { + return v; +} + +// ================================================== SWIZZLE + +template +HWY_API T GetLane(const Vec1 v) { + return v.raw; +} + +template +HWY_API T ExtractLane(const Vec1 v, size_t i) { + HWY_DASSERT(i == 0); + (void)i; + return v.raw; +} + +template +HWY_API Vec1 InsertLane(Vec1 v, size_t i, T t) { + HWY_DASSERT(i == 0); + (void)i; + v.raw = t; + return v; +} + +template +HWY_API Vec1 DupEven(Vec1 v) { + return v; +} +// DupOdd is unsupported. + +template +HWY_API Vec1 OddEven(Vec1 /* odd */, Vec1 even) { + return even; +} + +template +HWY_API Vec1 OddEvenBlocks(Vec1 /* odd */, Vec1 even) { + return even; +} + +// ------------------------------ SwapAdjacentBlocks + +template +HWY_API Vec1 SwapAdjacentBlocks(Vec1 v) { + return v; +} + +// ------------------------------ TableLookupLanes + +// Returned by SetTableIndices for use by TableLookupLanes. +template +struct Indices1 { + MakeSigned raw; +}; + +template +HWY_API Indices1 IndicesFromVec(Sisd, Vec1 vec) { + static_assert(sizeof(T) == sizeof(TI), "Index size must match lane size"); + HWY_DASSERT(vec.raw == 0); + return Indices1{vec.raw}; +} + +template +HWY_API Indices1 SetTableIndices(Sisd d, const TI* idx) { + return IndicesFromVec(d, LoadU(Sisd(), idx)); +} + +template +HWY_API Vec1 TableLookupLanes(const Vec1 v, const Indices1 /* idx */) { + return v; +} + +// ------------------------------ ReverseBlocks + +// Single block: no change +template +HWY_API Vec1 ReverseBlocks(Sisd /* tag */, const Vec1 v) { + return v; +} + +// ------------------------------ Reverse + +template +HWY_API Vec1 Reverse(Sisd /* tag */, const Vec1 v) { + return v; +} + +// Must not be called: +template +HWY_API Vec1 Reverse2(Sisd /* tag */, const Vec1 v) { + return v; +} + +template +HWY_API Vec1 Reverse4(Sisd /* tag */, const Vec1 v) { + return v; +} + +template +HWY_API Vec1 Reverse8(Sisd /* tag */, const Vec1 v) { + return v; +} + +// ================================================== BLOCKWISE +// Shift*Bytes, CombineShiftRightBytes, Interleave*, Shuffle* are unsupported. 
+ +// ------------------------------ Broadcast/splat any lane + +template +HWY_API Vec1 Broadcast(const Vec1 v) { + static_assert(kLane == 0, "Scalar only has one lane"); + return v; +} + +// ------------------------------ TableLookupBytes, TableLookupBytesOr0 + +template +HWY_API Vec1 TableLookupBytes(const Vec1 in, const Vec1 indices) { + uint8_t in_bytes[sizeof(T)]; + uint8_t idx_bytes[sizeof(T)]; + uint8_t out_bytes[sizeof(T)]; + CopyBytes(&in, &in_bytes); // copy to bytes + CopyBytes(&indices, &idx_bytes); + for (size_t i = 0; i < sizeof(T); ++i) { + out_bytes[i] = in_bytes[idx_bytes[i]]; + } + TI out; + CopyBytes(&out_bytes, &out); + return Vec1{out}; +} + +template +HWY_API Vec1 TableLookupBytesOr0(const Vec1 in, const Vec1 indices) { + uint8_t in_bytes[sizeof(T)]; + uint8_t idx_bytes[sizeof(T)]; + uint8_t out_bytes[sizeof(T)]; + CopyBytes(&in, &in_bytes); // copy to bytes + CopyBytes(&indices, &idx_bytes); + for (size_t i = 0; i < sizeof(T); ++i) { + out_bytes[i] = idx_bytes[i] & 0x80 ? 0 : in_bytes[idx_bytes[i]]; + } + TI out; + CopyBytes(&out_bytes, &out); + return Vec1{out}; +} + +// ------------------------------ ZipLower + +HWY_API Vec1 ZipLower(const Vec1 a, const Vec1 b) { + return Vec1(static_cast((uint32_t(b.raw) << 8) + a.raw)); +} +HWY_API Vec1 ZipLower(const Vec1 a, + const Vec1 b) { + return Vec1((uint32_t(b.raw) << 16) + a.raw); +} +HWY_API Vec1 ZipLower(const Vec1 a, + const Vec1 b) { + return Vec1((uint64_t(b.raw) << 32) + a.raw); +} +HWY_API Vec1 ZipLower(const Vec1 a, const Vec1 b) { + return Vec1(static_cast((int32_t(b.raw) << 8) + a.raw)); +} +HWY_API Vec1 ZipLower(const Vec1 a, const Vec1 b) { + return Vec1((int32_t(b.raw) << 16) + a.raw); +} +HWY_API Vec1 ZipLower(const Vec1 a, const Vec1 b) { + return Vec1((int64_t(b.raw) << 32) + a.raw); +} + +template , class VW = Vec1> +HWY_API VW ZipLower(Sisd /* tag */, Vec1 a, Vec1 b) { + return VW(static_cast((TW{b.raw} << (sizeof(T) * 8)) + a.raw)); +} + +// ================================================== MASK + +template +HWY_API bool AllFalse(Sisd /* tag */, const Mask1 mask) { + return mask.bits == 0; +} + +template +HWY_API bool AllTrue(Sisd /* tag */, const Mask1 mask) { + return mask.bits != 0; +} + +// `p` points to at least 8 readable bytes, not all of which need be valid. +template +HWY_API Mask1 LoadMaskBits(Sisd /* tag */, + const uint8_t* HWY_RESTRICT bits) { + return Mask1::FromBool((bits[0] & 1) != 0); +} + +// `p` points to at least 8 writable bytes. +template +HWY_API size_t StoreMaskBits(Sisd d, const Mask1 mask, uint8_t* bits) { + *bits = AllTrue(d, mask); + return 1; +} + +template +HWY_API size_t CountTrue(Sisd /* tag */, const Mask1 mask) { + return mask.bits == 0 ? 0 : 1; +} + +template +HWY_API intptr_t FindFirstTrue(Sisd /* tag */, const Mask1 mask) { + return mask.bits == 0 ? -1 : 0; +} + +template +HWY_API size_t FindKnownFirstTrue(Sisd /* tag */, const Mask1 /* m */) { + return 0; // There is only one lane and we know it is true. +} + +// ------------------------------ Compress, CompressBits + +template +struct CompressIsPartition { + enum { value = 1 }; +}; + +template +HWY_API Vec1 Compress(Vec1 v, const Mask1 /* mask */) { + // A single lane is already partitioned by definition. + return v; +} + +template +HWY_API Vec1 CompressNot(Vec1 v, const Mask1 /* mask */) { + // A single lane is already partitioned by definition. 
+ return v; +} + +// ------------------------------ CompressStore +template +HWY_API size_t CompressStore(Vec1 v, const Mask1 mask, Sisd d, + T* HWY_RESTRICT unaligned) { + StoreU(Compress(v, mask), d, unaligned); + return CountTrue(d, mask); +} + +// ------------------------------ CompressBlendedStore +template +HWY_API size_t CompressBlendedStore(Vec1 v, const Mask1 mask, Sisd d, + T* HWY_RESTRICT unaligned) { + if (!mask.bits) return 0; + StoreU(v, d, unaligned); + return 1; +} + +// ------------------------------ CompressBits +template +HWY_API Vec1 CompressBits(Vec1 v, const uint8_t* HWY_RESTRICT /*bits*/) { + return v; +} + +// ------------------------------ CompressBitsStore +template +HWY_API size_t CompressBitsStore(Vec1 v, const uint8_t* HWY_RESTRICT bits, + Sisd d, T* HWY_RESTRICT unaligned) { + const Mask1 mask = LoadMaskBits(d, bits); + StoreU(Compress(v, mask), d, unaligned); + return CountTrue(d, mask); +} + +// ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower) + +HWY_API Vec1 ReorderWidenMulAccumulate(Sisd /* tag */, + Vec1 a, + Vec1 b, + const Vec1 sum0, + Vec1& /* sum1 */) { + return MulAdd(Vec1(F32FromBF16(a.raw)), + Vec1(F32FromBF16(b.raw)), sum0); +} + +HWY_API Vec1 ReorderWidenMulAccumulate(Sisd /* tag */, + Vec1 a, + Vec1 b, + const Vec1 sum0, + Vec1& /* sum1 */) { + return Vec1(a.raw * b.raw + sum0.raw); +} + +// ================================================== REDUCTIONS + +// Sum of all lanes, i.e. the only one. +template +HWY_API Vec1 SumOfLanes(Sisd /* tag */, const Vec1 v) { + return v; +} +template +HWY_API Vec1 MinOfLanes(Sisd /* tag */, const Vec1 v) { + return v; +} +template +HWY_API Vec1 MaxOfLanes(Sisd /* tag */, const Vec1 v) { + return v; +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); diff --git a/hwy/ops/set_macros-inl.h b/hwy/ops/set_macros-inl.h new file mode 100644 index 0000000..c118960 --- /dev/null +++ b/hwy/ops/set_macros-inl.h @@ -0,0 +1,444 @@ +// Copyright 2020 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Sets macros based on HWY_TARGET. + +// This include guard is toggled by foreach_target, so avoid the usual _H_ +// suffix to prevent copybara from renaming it. 
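+// Toggle idiom: the #if below is open exactly when HWY_SET_MACROS_PER_TARGET
+// and HWY_TARGET_TOGGLE have the same definedness, and its body then flips
+// HWY_SET_MACROS_PER_TARGET. foreach_target.h flips HWY_TARGET_TOGGLE before
+// re-including per target, so together the two macros track whether this file
+// has already run for the current target.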
+#if defined(HWY_SET_MACROS_PER_TARGET) == defined(HWY_TARGET_TOGGLE) +#ifdef HWY_SET_MACROS_PER_TARGET +#undef HWY_SET_MACROS_PER_TARGET +#else +#define HWY_SET_MACROS_PER_TARGET +#endif + +#endif // HWY_SET_MACROS_PER_TARGET + +#include "hwy/detect_targets.h" + +#undef HWY_NAMESPACE +#undef HWY_ALIGN +#undef HWY_MAX_BYTES +#undef HWY_LANES + +#undef HWY_HAVE_SCALABLE +#undef HWY_HAVE_INTEGER64 +#undef HWY_HAVE_FLOAT16 +#undef HWY_HAVE_FLOAT64 +#undef HWY_MEM_OPS_MIGHT_FAULT +#undef HWY_NATIVE_FMA +#undef HWY_CAP_GE256 +#undef HWY_CAP_GE512 + +#undef HWY_TARGET_STR + +#if defined(HWY_DISABLE_PCLMUL_AES) +#define HWY_TARGET_STR_PCLMUL_AES "" +#else +#define HWY_TARGET_STR_PCLMUL_AES ",pclmul,aes" +#endif + +#if defined(HWY_DISABLE_BMI2_FMA) +#define HWY_TARGET_STR_BMI2_FMA "" +#else +#define HWY_TARGET_STR_BMI2_FMA ",bmi,bmi2,fma" +#endif + +#if defined(HWY_DISABLE_F16C) +#define HWY_TARGET_STR_F16C "" +#else +#define HWY_TARGET_STR_F16C ",f16c" +#endif + +#define HWY_TARGET_STR_SSSE3 "sse2,ssse3" + +#define HWY_TARGET_STR_SSE4 \ + HWY_TARGET_STR_SSSE3 ",sse4.1,sse4.2" HWY_TARGET_STR_PCLMUL_AES +// Include previous targets, which are the half-vectors of the next target. +#define HWY_TARGET_STR_AVX2 \ + HWY_TARGET_STR_SSE4 ",avx,avx2" HWY_TARGET_STR_BMI2_FMA HWY_TARGET_STR_F16C +#define HWY_TARGET_STR_AVX3 \ + HWY_TARGET_STR_AVX2 ",avx512f,avx512vl,avx512dq,avx512bw" + +// Before include guard so we redefine HWY_TARGET_STR on each include, +// governed by the current HWY_TARGET. + +//----------------------------------------------------------------------------- +// SSSE3 +#if HWY_TARGET == HWY_SSSE3 + +#define HWY_NAMESPACE N_SSSE3 +#define HWY_ALIGN alignas(16) +#define HWY_MAX_BYTES 16 +#define HWY_LANES(T) (16 / sizeof(T)) + +#define HWY_HAVE_SCALABLE 0 +#define HWY_HAVE_INTEGER64 1 +#define HWY_HAVE_FLOAT16 1 +#define HWY_HAVE_FLOAT64 1 +#define HWY_MEM_OPS_MIGHT_FAULT 1 +#define HWY_NATIVE_FMA 0 +#define HWY_CAP_GE256 0 +#define HWY_CAP_GE512 0 + +#define HWY_TARGET_STR HWY_TARGET_STR_SSSE3 + +//----------------------------------------------------------------------------- +// SSE4 +#elif HWY_TARGET == HWY_SSE4 + +#define HWY_NAMESPACE N_SSE4 +#define HWY_ALIGN alignas(16) +#define HWY_MAX_BYTES 16 +#define HWY_LANES(T) (16 / sizeof(T)) + +#define HWY_HAVE_SCALABLE 0 +#define HWY_HAVE_INTEGER64 1 +#define HWY_HAVE_FLOAT16 1 +#define HWY_HAVE_FLOAT64 1 +#define HWY_MEM_OPS_MIGHT_FAULT 1 +#define HWY_NATIVE_FMA 0 +#define HWY_CAP_GE256 0 +#define HWY_CAP_GE512 0 + +#define HWY_TARGET_STR HWY_TARGET_STR_SSE4 + +//----------------------------------------------------------------------------- +// AVX2 +#elif HWY_TARGET == HWY_AVX2 + +#define HWY_NAMESPACE N_AVX2 +#define HWY_ALIGN alignas(32) +#define HWY_MAX_BYTES 32 +#define HWY_LANES(T) (32 / sizeof(T)) + +#define HWY_HAVE_SCALABLE 0 +#define HWY_HAVE_INTEGER64 1 +#define HWY_HAVE_FLOAT16 1 +#define HWY_HAVE_FLOAT64 1 +#define HWY_MEM_OPS_MIGHT_FAULT 1 + +#ifdef HWY_DISABLE_BMI2_FMA +#define HWY_NATIVE_FMA 0 +#else +#define HWY_NATIVE_FMA 1 +#endif + +#define HWY_CAP_GE256 1 +#define HWY_CAP_GE512 0 + +#define HWY_TARGET_STR HWY_TARGET_STR_AVX2 + +//----------------------------------------------------------------------------- +// AVX3[_DL] +#elif HWY_TARGET == HWY_AVX3 || HWY_TARGET == HWY_AVX3_DL + +#define HWY_ALIGN alignas(64) +#define HWY_MAX_BYTES 64 +#define HWY_LANES(T) (64 / sizeof(T)) + +#define HWY_HAVE_SCALABLE 0 +#define HWY_HAVE_INTEGER64 1 +#define HWY_HAVE_FLOAT16 1 +#define HWY_HAVE_FLOAT64 1 +#define HWY_MEM_OPS_MIGHT_FAULT 
0 +#define HWY_NATIVE_FMA 1 +#define HWY_CAP_GE256 1 +#define HWY_CAP_GE512 1 + +#if HWY_TARGET == HWY_AVX3 + +#define HWY_NAMESPACE N_AVX3 +#define HWY_TARGET_STR HWY_TARGET_STR_AVX3 + +#elif HWY_TARGET == HWY_AVX3_DL + +#define HWY_NAMESPACE N_AVX3_DL +#define HWY_TARGET_STR \ + HWY_TARGET_STR_AVX3 \ + ",vpclmulqdq,avx512vbmi,avx512vbmi2,vaes,avxvnni,avx512bitalg," \ + "avx512vpopcntdq" + +#else +#error "Logic error" +#endif // HWY_TARGET == HWY_AVX3_DL + +//----------------------------------------------------------------------------- +// PPC8 +#elif HWY_TARGET == HWY_PPC8 + +#define HWY_ALIGN alignas(16) +#define HWY_MAX_BYTES 16 +#define HWY_LANES(T) (16 / sizeof(T)) + +#define HWY_HAVE_SCALABLE 0 +#define HWY_HAVE_INTEGER64 1 +#define HWY_HAVE_FLOAT16 0 +#define HWY_HAVE_FLOAT64 1 +#define HWY_MEM_OPS_MIGHT_FAULT 1 +#define HWY_NATIVE_FMA 1 +#define HWY_CAP_GE256 0 +#define HWY_CAP_GE512 0 + +#define HWY_NAMESPACE N_PPC8 + +#define HWY_TARGET_STR "altivec,vsx" + +//----------------------------------------------------------------------------- +// NEON +#elif HWY_TARGET == HWY_NEON + +#define HWY_ALIGN alignas(16) +#define HWY_MAX_BYTES 16 +#define HWY_LANES(T) (16 / sizeof(T)) + +#define HWY_HAVE_SCALABLE 0 +#define HWY_HAVE_INTEGER64 1 +#define HWY_HAVE_FLOAT16 1 + +#if HWY_ARCH_ARM_A64 +#define HWY_HAVE_FLOAT64 1 +#else +#define HWY_HAVE_FLOAT64 0 +#endif + +#define HWY_MEM_OPS_MIGHT_FAULT 1 + +#if defined(__ARM_VFPV4__) || HWY_ARCH_ARM_A64 +#define HWY_NATIVE_FMA 1 +#else +#define HWY_NATIVE_FMA 0 +#endif + +#define HWY_CAP_GE256 0 +#define HWY_CAP_GE512 0 + +#define HWY_NAMESPACE N_NEON + +// Can use pragmas instead of -march compiler flag +#if HWY_HAVE_RUNTIME_DISPATCH +#if HWY_ARCH_ARM_V7 +#define HWY_TARGET_STR "+neon-vfpv4" +#else +#define HWY_TARGET_STR "+crypto" +#endif // HWY_ARCH_ARM_V7 +#else +// HWY_TARGET_STR remains undefined +#endif + +//----------------------------------------------------------------------------- +// SVE[2] +#elif HWY_TARGET == HWY_SVE2 || HWY_TARGET == HWY_SVE || \ + HWY_TARGET == HWY_SVE_256 || HWY_TARGET == HWY_SVE2_128 + +// SVE only requires lane alignment, not natural alignment of the entire vector. +#define HWY_ALIGN alignas(8) + +// Value ensures MaxLanes() is the tightest possible upper bound to reduce +// overallocation. 
+#define HWY_LANES(T) ((HWY_MAX_BYTES) / sizeof(T)) + +#define HWY_HAVE_SCALABLE 1 +#define HWY_HAVE_INTEGER64 1 +#define HWY_HAVE_FLOAT16 1 +#define HWY_HAVE_FLOAT64 1 +#define HWY_MEM_OPS_MIGHT_FAULT 0 +#define HWY_NATIVE_FMA 1 +#define HWY_CAP_GE256 0 +#define HWY_CAP_GE512 0 + +#if HWY_TARGET == HWY_SVE2 +#define HWY_NAMESPACE N_SVE2 +#define HWY_MAX_BYTES 256 +#elif HWY_TARGET == HWY_SVE_256 +#define HWY_NAMESPACE N_SVE_256 +#define HWY_MAX_BYTES 32 +#elif HWY_TARGET == HWY_SVE2_128 +#define HWY_NAMESPACE N_SVE2_128 +#define HWY_MAX_BYTES 16 +#else +#define HWY_NAMESPACE N_SVE +#define HWY_MAX_BYTES 256 +#endif + +// Can use pragmas instead of -march compiler flag +#if HWY_HAVE_RUNTIME_DISPATCH +#if HWY_TARGET == HWY_SVE2 || HWY_TARGET == HWY_SVE2_128 +#define HWY_TARGET_STR "+sve2-aes" +#else +#define HWY_TARGET_STR "+sve" +#endif +#else +// HWY_TARGET_STR remains undefined +#endif + +//----------------------------------------------------------------------------- +// WASM +#elif HWY_TARGET == HWY_WASM + +#define HWY_ALIGN alignas(16) +#define HWY_MAX_BYTES 16 +#define HWY_LANES(T) (16 / sizeof(T)) + +#define HWY_HAVE_SCALABLE 0 +#define HWY_HAVE_INTEGER64 1 +#define HWY_HAVE_FLOAT16 1 +#define HWY_HAVE_FLOAT64 0 +#define HWY_MEM_OPS_MIGHT_FAULT 1 +#define HWY_NATIVE_FMA 0 +#define HWY_CAP_GE256 0 +#define HWY_CAP_GE512 0 + +#define HWY_NAMESPACE N_WASM + +#define HWY_TARGET_STR "simd128" + +//----------------------------------------------------------------------------- +// WASM_EMU256 +#elif HWY_TARGET == HWY_WASM_EMU256 + +#define HWY_ALIGN alignas(32) +#define HWY_MAX_BYTES 32 +#define HWY_LANES(T) (32 / sizeof(T)) + +#define HWY_HAVE_SCALABLE 0 +#define HWY_HAVE_INTEGER64 1 +#define HWY_HAVE_FLOAT16 1 +#define HWY_HAVE_FLOAT64 0 +#define HWY_MEM_OPS_MIGHT_FAULT 1 +#define HWY_NATIVE_FMA 0 +#define HWY_CAP_GE256 0 +#define HWY_CAP_GE512 0 + +#define HWY_NAMESPACE N_WASM_EMU256 + +#define HWY_TARGET_STR "simd128" + +//----------------------------------------------------------------------------- +// RVV +#elif HWY_TARGET == HWY_RVV + +// RVV only requires lane alignment, not natural alignment of the entire vector, +// and the compiler already aligns builtin types, so nothing to do here. +#define HWY_ALIGN + +// The spec requires VLEN <= 2^16 bits, so the limit is 2^16 bytes (LMUL=8). +#define HWY_MAX_BYTES 65536 + +// = HWY_MAX_BYTES divided by max LMUL=8 because MaxLanes includes the actual +// LMUL. This is the tightest possible upper bound. +#define HWY_LANES(T) (8192 / sizeof(T)) + +#define HWY_HAVE_SCALABLE 1 +#define HWY_HAVE_INTEGER64 1 +#define HWY_HAVE_FLOAT64 1 +#define HWY_MEM_OPS_MIGHT_FAULT 0 +#define HWY_NATIVE_FMA 1 +#define HWY_CAP_GE256 0 +#define HWY_CAP_GE512 0 + +#if defined(__riscv_zvfh) +#define HWY_HAVE_FLOAT16 1 +#else +#define HWY_HAVE_FLOAT16 0 +#endif + +#define HWY_NAMESPACE N_RVV + +// HWY_TARGET_STR remains undefined so HWY_ATTR is a no-op. +// (rv64gcv is not a valid target) + +//----------------------------------------------------------------------------- +// EMU128 +#elif HWY_TARGET == HWY_EMU128 + +#define HWY_ALIGN alignas(16) +#define HWY_MAX_BYTES 16 +#define HWY_LANES(T) (16 / sizeof(T)) + +#define HWY_HAVE_SCALABLE 0 +#define HWY_HAVE_INTEGER64 1 +#define HWY_HAVE_FLOAT16 1 +#define HWY_HAVE_FLOAT64 1 +#define HWY_MEM_OPS_MIGHT_FAULT 1 +#define HWY_NATIVE_FMA 0 +#define HWY_CAP_GE256 0 +#define HWY_CAP_GE512 0 + +#define HWY_NAMESPACE N_EMU128 + +// HWY_TARGET_STR remains undefined so HWY_ATTR is a no-op. 
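+// Illustrative example (editor's note, not part of the upstream source):
+// these per-target macros are compile-time capacities. Under HWY_EMU128
+// above, HWY_MAX_BYTES is 16, so HWY_LANES(uint8_t) == 16 and
+// HWY_LANES(float) == 4; under HWY_SCALAR below, HWY_LANES(T) is always 1.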
+
+//-----------------------------------------------------------------------------
+// SCALAR
+#elif HWY_TARGET == HWY_SCALAR
+
+#define HWY_ALIGN
+#define HWY_MAX_BYTES 8
+#define HWY_LANES(T) 1
+
+#define HWY_HAVE_SCALABLE 0
+#define HWY_HAVE_INTEGER64 1
+#define HWY_HAVE_FLOAT16 1
+#define HWY_HAVE_FLOAT64 1
+#define HWY_MEM_OPS_MIGHT_FAULT 0
+#define HWY_NATIVE_FMA 0
+#define HWY_CAP_GE256 0
+#define HWY_CAP_GE512 0
+
+#define HWY_NAMESPACE N_SCALAR
+
+// HWY_TARGET_STR remains undefined so HWY_ATTR is a no-op.
+
+#else
+#pragma message("HWY_TARGET does not match any known target")
+#endif  // HWY_TARGET
+
+// Override this to 1 in asan/msan builds, which will still fault.
+#if HWY_IS_ASAN || HWY_IS_MSAN
+#undef HWY_MEM_OPS_MIGHT_FAULT
+#define HWY_MEM_OPS_MIGHT_FAULT 1
+#endif
+
+// Clang <9 requires this be invoked at file scope, before any namespace.
+#undef HWY_BEFORE_NAMESPACE
+#if defined(HWY_TARGET_STR)
+#define HWY_BEFORE_NAMESPACE() \
+  HWY_PUSH_ATTRIBUTES(HWY_TARGET_STR) \
+  static_assert(true, "For requiring trailing semicolon")
+#else
+// avoids compiler warning if no HWY_TARGET_STR
+#define HWY_BEFORE_NAMESPACE() \
+  static_assert(true, "For requiring trailing semicolon")
+#endif
+
+// Clang <9 requires any namespaces be closed before this macro.
+#undef HWY_AFTER_NAMESPACE
+#if defined(HWY_TARGET_STR)
+#define HWY_AFTER_NAMESPACE() \
+  HWY_POP_ATTRIBUTES \
+  static_assert(true, "For requiring trailing semicolon")
+#else
+// avoids compiler warning if no HWY_TARGET_STR
+#define HWY_AFTER_NAMESPACE() \
+  static_assert(true, "For requiring trailing semicolon")
+#endif
+
+#undef HWY_ATTR
+#if defined(HWY_TARGET_STR) && HWY_HAS_ATTRIBUTE(target)
+#define HWY_ATTR __attribute__((target(HWY_TARGET_STR)))
+#else
+#define HWY_ATTR
+#endif
diff --git a/hwy/ops/shared-inl.h b/hwy/ops/shared-inl.h
new file mode 100644
index 0000000..29c4303
--- /dev/null
+++ b/hwy/ops/shared-inl.h
@@ -0,0 +1,311 @@
+// Copyright 2020 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Per-target definitions shared by ops/*.h and user code.
+
+#include <stddef.h>
+
+#include "hwy/base.h"
+
+// Separate header because foreach_target.h re-enables its include guard.
+#include "hwy/ops/set_macros-inl.h"
+
+// Relies on the external include guard in highway.h.
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+// Highway operations are implemented as overloaded functions selected using an
+// internal-only tag type D := Simd<T, N, kPow2>. T is the lane type. kPow2 is
+// a shift count applied to scalable vectors. Instead of referring to Simd<>
+// directly, users create D via aliases ScalableTag<T, kPow2>() (defaults to a
+// full vector, or fractions/groups if the argument is negative/positive),
+// CappedTag<T, kLimit> or FixedTag<T, kNumLanes>. The actual number of lanes
+// is Lanes(D()), a power of two. For scalable vectors, N is either HWY_LANES
+// or a cap. For constexpr-size vectors, N is the actual number of lanes. This
+// ensures Half<Full512<T>> is the same type as Full256<T>, as required by x86.
+template <typename Lane, size_t N, int kPow2>
+struct Simd {
+  constexpr Simd() = default;
+  using T = Lane;
+  static_assert((N & (N - 1)) == 0 && N != 0, "N must be a power of two");
+
+  // Only for use by MaxLanes, required by MSVC. Cannot be enum because GCC
+  // warns when using enums and non-enums in the same expression. Cannot be
+  // static constexpr function (another MSVC limitation).
+  static constexpr size_t kPrivateN = N;
+  static constexpr int kPrivatePow2 = kPow2;
+
+  template <typename NewT>
+  static constexpr size_t NewN() {
+    // Round up to correctly handle scalars with N=1.
+    return (N * sizeof(T) + sizeof(NewT) - 1) / sizeof(NewT);
+  }
+
+#if HWY_HAVE_SCALABLE
+  template <typename NewT>
+  static constexpr int Pow2Ratio() {
+    return (sizeof(NewT) > sizeof(T))
+               ? static_cast<int>(CeilLog2(sizeof(NewT) / sizeof(T)))
+               : -static_cast<int>(CeilLog2(sizeof(T) / sizeof(NewT)));
+  }
+#endif
+
+  // Widening/narrowing ops change the number of lanes and/or their type.
+  // To initialize such vectors, we need the corresponding tag types:
+
+// PromoteTo/DemoteTo() with another lane type, but same number of lanes.
+#if HWY_HAVE_SCALABLE
+  template <typename NewT>
+  using Rebind = Simd<NewT, N, kPow2 + Pow2Ratio<NewT>()>;
+#else
+  template <typename NewT>
+  using Rebind = Simd<NewT, N, kPow2>;
+#endif
+
+  // Change lane type while keeping the same vector size, e.g. for MulEven.
+  template <typename NewT>
+  using Repartition = Simd<NewT, NewN<NewT>(), kPow2>;
+
+// Half the lanes while keeping the same lane type, e.g. for LowerHalf.
+// Round up to correctly handle scalars with N=1.
+#if HWY_HAVE_SCALABLE
+  // Reducing the cap (N) is required for SVE - if N is the limiter for f32xN,
+  // then we expect Half<Rebind<u16>> to have N/2 lanes (rounded up).
+  using Half = Simd<T, (N + 1) / 2, kPow2 - 1>;
+#else
+  using Half = Simd<T, (N + 1) / 2, kPow2>;
+#endif
+
+// Twice the lanes while keeping the same lane type, e.g. for Combine.
+#if HWY_HAVE_SCALABLE
+  using Twice = Simd<T, 2 * N, kPow2 + 1>;
+#else
+  using Twice = Simd<T, 2 * N, kPow2>;
+#endif
+};
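+// Illustrative example (editor's sketch, not part of the upstream source):
+// for D = Simd<uint8_t, 16, 0>, D::Repartition<uint32_t> keeps the same 16
+// bytes but has NewN() = (16 * 1 + 3) / 4 = 4 lanes, D::Half has
+// (16 + 1) / 2 = 8 lanes, and D::Twice has 32.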
+
+namespace detail {
+
+template <typename T, size_t N, int kPow2>
+constexpr bool IsFull(Simd<T, N, kPow2> /* d */) {
+  return N == HWY_LANES(T) && kPow2 == 0;
+}
+
+// Returns the number of lanes (possibly zero) after applying a shift:
+// - 0: no change;
+// - [1,3]: a group of 2,4,8 [fractional] vectors;
+// - [-3,-1]: a fraction of a vector from 1/8 to 1/2.
+constexpr size_t ScaleByPower(size_t N, int pow2) {
+#if HWY_TARGET == HWY_RVV
+  return pow2 >= 0 ? (N << pow2) : (N >> (-pow2));
+#else
+  return pow2 >= 0 ? N : (N >> (-pow2));
+#endif
+}
+
+// Struct wrappers enable validation of arguments via static_assert.
+template <typename T, int kPow2>
+struct ScalableTagChecker {
+  static_assert(-3 <= kPow2 && kPow2 <= 3, "Fraction must be 1/8 to 8");
+#if HWY_TARGET == HWY_RVV
+  // Only RVV supports register groups.
+  using type = Simd<T, HWY_LANES(T), kPow2>;
+#elif HWY_HAVE_SCALABLE
+  // For SVE[2], only allow full or fractions.
+  using type = Simd<T, HWY_LANES(T), HWY_MIN(kPow2, 0)>;
+#elif HWY_TARGET == HWY_SCALAR
+  using type = Simd<T, /*N=*/1, /*kPow2=*/0>;
+#else
+  // Only allow full or fractions.
+  using type = Simd<T, ScaleByPower(HWY_LANES(T), HWY_MIN(kPow2, 0)), 0>;
+#endif
+};
+
+template <typename T, size_t kLimit>
+struct CappedTagChecker {
+  static_assert(kLimit != 0, "Does not make sense to have zero lanes");
+  // Safely handle non-power-of-two inputs by rounding down, which is allowed
+  // by CappedTag. Otherwise, Simd<T, 3, 0> would static_assert.
+  static constexpr size_t kLimitPow2 = size_t{1} << hwy::FloorLog2(kLimit);
+  using type = Simd<T, HWY_MIN(kLimitPow2, HWY_LANES(T)), 0>;
+};
+
+template <typename T, size_t kNumLanes>
+struct FixedTagChecker {
+  static_assert(kNumLanes != 0, "Does not make sense to have zero lanes");
+  static_assert(kNumLanes <= HWY_LANES(T), "Too many lanes");
+  using type = Simd<T, kNumLanes, 0>;
+};
+
+}  // namespace detail
+
+// Alias for a tag describing a full vector (kPow2 == 0: the most common usage,
+// e.g. 1D loops where the application does not care about the vector size) or
+// a fraction/multiple of one. Multiples are the same as full vectors for all
+// targets except RVV. Fractions (kPow2 < 0) are useful as the argument/return
+// value of type promotion and demotion.
+template <typename T, int kPow2 = 0>
+using ScalableTag = typename detail::ScalableTagChecker<T, kPow2>::type;
+
+// Alias for a tag describing a vector with *up to* kLimit active lanes, even
+// on targets with scalable vectors and HWY_SCALAR. The runtime lane count
+// `Lanes(tag)` may be less than kLimit, and is 1 on HWY_SCALAR. This alias is
+// typically used for 1D loops with a relatively low application-defined upper
+// bound, e.g. for 8x8 DCTs. However, it is better if data structures are
+// designed to be vector-length-agnostic (e.g. a hybrid SoA where there are
+// chunks of `M >= MaxLanes(d)` DC components followed by M AC1, .., and M
+// AC63; this would enable vector-length-agnostic loops using ScalableTag).
+template <typename T, size_t kLimit>
+using CappedTag = typename detail::CappedTagChecker<T, kLimit>::type;
+
+// Alias for a tag describing a vector with *exactly* kNumLanes active lanes,
+// even on targets with scalable vectors. Requires `kNumLanes` to be a power
+// of two not exceeding `HWY_LANES(T)`.
+//
+// NOTE: if the application does not need to support HWY_SCALAR (+), use this
+// instead of CappedTag to emphasize that there will be exactly kNumLanes
+// lanes. This is useful for data structures that rely on exactly 128-bit
+// SIMD, but these are discouraged because they cannot benefit from wider
+// vectors. Instead, applications would ideally define a larger problem size
+// and loop over it with the (unknown size) vectors from ScalableTag.
+//
+// + e.g. if the baseline is known to support SIMD, or the application
+// requires ops such as TableLookupBytes not supported by HWY_SCALAR.
+template <typename T, size_t kNumLanes>
+using FixedTag = typename detail::FixedTagChecker<T, kNumLanes>::type;
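+// Illustrative usage (editor's sketch, not part of the upstream source;
+// assumes the enclosing HWY_NAMESPACE and a non-scalar target):
+//   ScalableTag<float> df;        // full native vector of float
+//   ScalableTag<float, -1> dhf;   // half-width fraction, e.g. for promotions
+//   CappedTag<int16_t, 8> dc;     // at most 8 lanes; may be fewer at runtime
+//   FixedTag<uint8_t, 16> dfx;    // exactly 16 lanes; fails to compile on
+//                                 // targets with fewer
+//   const size_t N = Lanes(df);   // runtime lane count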
+
+template <class D>
+using TFromD = typename D::T;
+
+// Tag for the same number of lanes as D, but with the LaneType T.
+template <class T, class D>
+using Rebind = typename D::template Rebind<T>;
+
+template <class D>
+using RebindToSigned = Rebind<MakeSigned<TFromD<D>>, D>;
+template <class D>
+using RebindToUnsigned = Rebind<MakeUnsigned<TFromD<D>>, D>;
+template <class D>
+using RebindToFloat = Rebind<MakeFloat<TFromD<D>>, D>;
+
+// Tag for the same total size as D, but with the LaneType T.
+template <class T, class D>
+using Repartition = typename D::template Repartition<T>;
+
+template <class D>
+using RepartitionToWide = Repartition<MakeWide<TFromD<D>>, D>;
+template <class D>
+using RepartitionToNarrow = Repartition<MakeNarrow<TFromD<D>>, D>;
+
+// Tag for the same lane type as D, but half the lanes.
+template <class D>
+using Half = typename D::Half;
+
+// Tag for the same lane type as D, but twice the lanes.
+template <class D>
+using Twice = typename D::Twice;
+
+template <typename T>
+using Full32 = Simd<T, 4 / sizeof(T), 0>;
+
+template <typename T>
+using Full64 = Simd<T, 8 / sizeof(T), 0>;
+
+template <typename T>
+using Full128 = Simd<T, 16 / sizeof(T), 0>;
+
+// Same as base.h macros but with a Simd<T, N, kPow2> argument instead of T.
+#define HWY_IF_UNSIGNED_D(D) HWY_IF_UNSIGNED(TFromD<D>)
+#define HWY_IF_SIGNED_D(D) HWY_IF_SIGNED(TFromD<D>)
+#define HWY_IF_FLOAT_D(D) HWY_IF_FLOAT(TFromD<D>)
+#define HWY_IF_NOT_FLOAT_D(D) HWY_IF_NOT_FLOAT(TFromD<D>)
+#define HWY_IF_LANE_SIZE_D(D, bytes) HWY_IF_LANE_SIZE(TFromD<D>, bytes)
+#define HWY_IF_NOT_LANE_SIZE_D(D, bytes) HWY_IF_NOT_LANE_SIZE(TFromD<D>, bytes)
+
+// MSVC workaround: use PrivateN directly instead of MaxLanes.
+#define HWY_IF_LT128_D(D) \
+  hwy::EnableIf<D::kPrivateN * sizeof(TFromD<D>) < 16>* = nullptr
+#define HWY_IF_GE128_D(D) \
+  hwy::EnableIf<D::kPrivateN * sizeof(TFromD<D>) >= 16>* = nullptr
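+// Illustrative usage (editor's sketch, not part of the upstream source):
+// the _D macros constrain overloads via SFINAE, e.g. a float-only helper
+// (VFromD is provided by each ops/*-inl.h):
+//   template <class D, HWY_IF_FLOAT_D(D)>
+//   VFromD<D> PlusHalf(D d, VFromD<D> v) {
+//     return Add(v, Set(d, TFromD<D>(0.5)));
+//   }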
+
+// Same, but with a vector argument. ops/*-inl.h define their own TFromV.
+#define HWY_IF_UNSIGNED_V(V) HWY_IF_UNSIGNED(TFromV<V>)
+#define HWY_IF_SIGNED_V(V) HWY_IF_SIGNED(TFromV<V>)
+#define HWY_IF_FLOAT_V(V) HWY_IF_FLOAT(TFromV<V>)
+#define HWY_IF_LANE_SIZE_V(V, bytes) HWY_IF_LANE_SIZE(TFromV<V>, bytes)
+#define HWY_IF_NOT_LANE_SIZE_V(V, bytes) HWY_IF_NOT_LANE_SIZE(TFromV<V>, bytes)
+
+template <class D>
+HWY_INLINE HWY_MAYBE_UNUSED constexpr int Pow2(D /* d */) {
+  return D::kPrivatePow2;
+}
+
+// MSVC requires the explicit <D>.
+#define HWY_IF_POW2_GE(D, MIN) hwy::EnableIf<Pow2<D>(D()) >= (MIN)>* = nullptr
+
+#if HWY_HAVE_SCALABLE
+
+// Upper bound on the number of lanes. Intended for template arguments and
+// reducing code size (e.g. for SSE4, we know at compile-time that vectors will
+// not exceed 16 bytes). WARNING: this may be a loose bound, use Lanes() as the
+// actual size for allocating storage. WARNING: MSVC might not be able to
+// deduce arguments if this is used in EnableIf. See HWY_IF_LT128_D above.
+template <class D>
+HWY_INLINE HWY_MAYBE_UNUSED constexpr size_t MaxLanes(D) {
+  return detail::ScaleByPower(HWY_MIN(D::kPrivateN, HWY_LANES(TFromD<D>)),
+                              D::kPrivatePow2);
+}
+
+#else
+// Workaround for MSVC 2017: T,N,kPow2 argument deduction fails, so returning N
+// is not an option, nor does a member function work.
+template <class D>
+HWY_INLINE HWY_MAYBE_UNUSED constexpr size_t MaxLanes(D) {
+  return D::kPrivateN;
+}
+
+// (Potentially) non-constant actual size of the vector at runtime, subject to
+// the limit imposed by the Simd. Useful for advancing loop counters.
+// Targets with scalable vectors define this themselves.
+template <typename T, size_t N, int kPow2>
+HWY_INLINE HWY_MAYBE_UNUSED size_t Lanes(Simd<T, N, kPow2>) {
+  return N;
+}
+
+#endif  // !HWY_HAVE_SCALABLE
+
+// NOTE: GCC generates incorrect code for vector arguments to non-inlined
+// functions in two situations:
+// - on Windows and GCC 10.3, passing by value crashes due to unaligned loads:
+//   https://gcc.gnu.org/bugzilla/show_bug.cgi?id=54412.
+// - on ARM64 and GCC 9.3.0 or 11.2.1, passing by value causes many (but not
+//   all) tests to fail.
+//
+// We therefore pass by const& only on GCC and (Windows or ARM64). This alias
+// must be used for all vector/mask parameters of functions marked
+// HWY_NOINLINE, and possibly also other functions that are not inlined.
+#if HWY_COMPILER_GCC_ACTUAL && (HWY_OS_WIN || HWY_ARCH_ARM_A64)
+template <class V>
+using VecArg = const V&;
+#else
+template <class V>
+using VecArg = V;
+#endif
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace hwy
+HWY_AFTER_NAMESPACE();
diff --git a/hwy/ops/wasm_128-inl.h b/hwy/ops/wasm_128-inl.h
new file mode 100644
index 0000000..3831258
--- /dev/null
+++ b/hwy/ops/wasm_128-inl.h
@@ -0,0 +1,4589 @@
+// Copyright 2019 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// 128-bit WASM vectors and operations.
+// External include guard in highway.h - see comment there.
+ +#include +#include +#include + +#include "hwy/base.h" +#include "hwy/ops/shared-inl.h" + +#ifdef HWY_WASM_OLD_NAMES +#define wasm_i8x16_shuffle wasm_v8x16_shuffle +#define wasm_i16x8_shuffle wasm_v16x8_shuffle +#define wasm_i32x4_shuffle wasm_v32x4_shuffle +#define wasm_i64x2_shuffle wasm_v64x2_shuffle +#define wasm_u16x8_extend_low_u8x16 wasm_i16x8_widen_low_u8x16 +#define wasm_u32x4_extend_low_u16x8 wasm_i32x4_widen_low_u16x8 +#define wasm_i32x4_extend_low_i16x8 wasm_i32x4_widen_low_i16x8 +#define wasm_i16x8_extend_low_i8x16 wasm_i16x8_widen_low_i8x16 +#define wasm_u32x4_extend_high_u16x8 wasm_i32x4_widen_high_u16x8 +#define wasm_i32x4_extend_high_i16x8 wasm_i32x4_widen_high_i16x8 +#define wasm_i32x4_trunc_sat_f32x4 wasm_i32x4_trunc_saturate_f32x4 +#define wasm_u8x16_add_sat wasm_u8x16_add_saturate +#define wasm_u8x16_sub_sat wasm_u8x16_sub_saturate +#define wasm_u16x8_add_sat wasm_u16x8_add_saturate +#define wasm_u16x8_sub_sat wasm_u16x8_sub_saturate +#define wasm_i8x16_add_sat wasm_i8x16_add_saturate +#define wasm_i8x16_sub_sat wasm_i8x16_sub_saturate +#define wasm_i16x8_add_sat wasm_i16x8_add_saturate +#define wasm_i16x8_sub_sat wasm_i16x8_sub_saturate +#endif + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { + +namespace detail { + +template +struct Raw128 { + using type = __v128_u; +}; +template <> +struct Raw128 { + using type = __f32x4; +}; + +} // namespace detail + +template +class Vec128 { + using Raw = typename detail::Raw128::type; + + public: + // Compound assignment. Only usable if there is a corresponding non-member + // binary operator overload. For example, only f32 and f64 support division. + HWY_INLINE Vec128& operator*=(const Vec128 other) { + return *this = (*this * other); + } + HWY_INLINE Vec128& operator/=(const Vec128 other) { + return *this = (*this / other); + } + HWY_INLINE Vec128& operator+=(const Vec128 other) { + return *this = (*this + other); + } + HWY_INLINE Vec128& operator-=(const Vec128 other) { + return *this = (*this - other); + } + HWY_INLINE Vec128& operator&=(const Vec128 other) { + return *this = (*this & other); + } + HWY_INLINE Vec128& operator|=(const Vec128 other) { + return *this = (*this | other); + } + HWY_INLINE Vec128& operator^=(const Vec128 other) { + return *this = (*this ^ other); + } + + Raw raw; +}; + +template +using Vec64 = Vec128; + +template +using Vec32 = Vec128; + +// FF..FF or 0. +template +struct Mask128 { + typename detail::Raw128::type raw; +}; + +namespace detail { + +// Deduce Simd from Vec128 +struct DeduceD { + template + Simd operator()(Vec128) const { + return Simd(); + } +}; + +} // namespace detail + +template +using DFromV = decltype(detail::DeduceD()(V())); + +template +using TFromV = TFromD>; + +// ------------------------------ BitCast + +namespace detail { + +HWY_INLINE __v128_u BitCastToInteger(__v128_u v) { return v; } +HWY_INLINE __v128_u BitCastToInteger(__f32x4 v) { + return static_cast<__v128_u>(v); +} +HWY_INLINE __v128_u BitCastToInteger(__f64x2 v) { + return static_cast<__v128_u>(v); +} + +template +HWY_INLINE Vec128 BitCastToByte(Vec128 v) { + return Vec128{BitCastToInteger(v.raw)}; +} + +// Cannot rely on function overloading because return types differ. 
+template +struct BitCastFromInteger128 { + HWY_INLINE __v128_u operator()(__v128_u v) { return v; } +}; +template <> +struct BitCastFromInteger128 { + HWY_INLINE __f32x4 operator()(__v128_u v) { return static_cast<__f32x4>(v); } +}; + +template +HWY_INLINE Vec128 BitCastFromByte(Simd /* tag */, + Vec128 v) { + return Vec128{BitCastFromInteger128()(v.raw)}; +} + +} // namespace detail + +template +HWY_API Vec128 BitCast(Simd d, + Vec128 v) { + return detail::BitCastFromByte(d, detail::BitCastToByte(v)); +} + +// ------------------------------ Zero + +// Returns an all-zero vector/part. +template +HWY_API Vec128 Zero(Simd /* tag */) { + return Vec128{wasm_i32x4_splat(0)}; +} +template +HWY_API Vec128 Zero(Simd /* tag */) { + return Vec128{wasm_f32x4_splat(0.0f)}; +} + +template +using VFromD = decltype(Zero(D())); + +// ------------------------------ Set + +// Returns a vector/part with all lanes set to "t". +template +HWY_API Vec128 Set(Simd /* tag */, const uint8_t t) { + return Vec128{wasm_i8x16_splat(static_cast(t))}; +} +template +HWY_API Vec128 Set(Simd /* tag */, + const uint16_t t) { + return Vec128{wasm_i16x8_splat(static_cast(t))}; +} +template +HWY_API Vec128 Set(Simd /* tag */, + const uint32_t t) { + return Vec128{wasm_i32x4_splat(static_cast(t))}; +} +template +HWY_API Vec128 Set(Simd /* tag */, + const uint64_t t) { + return Vec128{wasm_i64x2_splat(static_cast(t))}; +} + +template +HWY_API Vec128 Set(Simd /* tag */, const int8_t t) { + return Vec128{wasm_i8x16_splat(t)}; +} +template +HWY_API Vec128 Set(Simd /* tag */, const int16_t t) { + return Vec128{wasm_i16x8_splat(t)}; +} +template +HWY_API Vec128 Set(Simd /* tag */, const int32_t t) { + return Vec128{wasm_i32x4_splat(t)}; +} +template +HWY_API Vec128 Set(Simd /* tag */, const int64_t t) { + return Vec128{wasm_i64x2_splat(t)}; +} + +template +HWY_API Vec128 Set(Simd /* tag */, const float t) { + return Vec128{wasm_f32x4_splat(t)}; +} + +HWY_DIAGNOSTICS(push) +HWY_DIAGNOSTICS_OFF(disable : 4700, ignored "-Wuninitialized") + +// Returns a vector with uninitialized elements. +template +HWY_API Vec128 Undefined(Simd d) { + return Zero(d); +} + +HWY_DIAGNOSTICS(pop) + +// Returns a vector with lane i=[0, N) set to "first" + i. 
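+// Illustrative example (editor's note, not part of the upstream source):
+// Iota(d, 10) with four int32 lanes yields {10, 11, 12, 13}.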
+template +Vec128 Iota(const Simd d, const T2 first) { + HWY_ALIGN T lanes[16 / sizeof(T)]; + for (size_t i = 0; i < 16 / sizeof(T); ++i) { + lanes[i] = static_cast(first + static_cast(i)); + } + return Load(d, lanes); +} + +// ================================================== ARITHMETIC + +// ------------------------------ Addition + +// Unsigned +template +HWY_API Vec128 operator+(const Vec128 a, + const Vec128 b) { + return Vec128{wasm_i8x16_add(a.raw, b.raw)}; +} +template +HWY_API Vec128 operator+(const Vec128 a, + const Vec128 b) { + return Vec128{wasm_i16x8_add(a.raw, b.raw)}; +} +template +HWY_API Vec128 operator+(const Vec128 a, + const Vec128 b) { + return Vec128{wasm_i32x4_add(a.raw, b.raw)}; +} +template +HWY_API Vec128 operator+(const Vec128 a, + const Vec128 b) { + return Vec128{wasm_i64x2_add(a.raw, b.raw)}; +} + +// Signed +template +HWY_API Vec128 operator+(const Vec128 a, + const Vec128 b) { + return Vec128{wasm_i8x16_add(a.raw, b.raw)}; +} +template +HWY_API Vec128 operator+(const Vec128 a, + const Vec128 b) { + return Vec128{wasm_i16x8_add(a.raw, b.raw)}; +} +template +HWY_API Vec128 operator+(const Vec128 a, + const Vec128 b) { + return Vec128{wasm_i32x4_add(a.raw, b.raw)}; +} +template +HWY_API Vec128 operator+(const Vec128 a, + const Vec128 b) { + return Vec128{wasm_i64x2_add(a.raw, b.raw)}; +} + +// Float +template +HWY_API Vec128 operator+(const Vec128 a, + const Vec128 b) { + return Vec128{wasm_f32x4_add(a.raw, b.raw)}; +} + +// ------------------------------ Subtraction + +// Unsigned +template +HWY_API Vec128 operator-(const Vec128 a, + const Vec128 b) { + return Vec128{wasm_i8x16_sub(a.raw, b.raw)}; +} +template +HWY_API Vec128 operator-(Vec128 a, + Vec128 b) { + return Vec128{wasm_i16x8_sub(a.raw, b.raw)}; +} +template +HWY_API Vec128 operator-(const Vec128 a, + const Vec128 b) { + return Vec128{wasm_i32x4_sub(a.raw, b.raw)}; +} +template +HWY_API Vec128 operator-(const Vec128 a, + const Vec128 b) { + return Vec128{wasm_i64x2_sub(a.raw, b.raw)}; +} + +// Signed +template +HWY_API Vec128 operator-(const Vec128 a, + const Vec128 b) { + return Vec128{wasm_i8x16_sub(a.raw, b.raw)}; +} +template +HWY_API Vec128 operator-(const Vec128 a, + const Vec128 b) { + return Vec128{wasm_i16x8_sub(a.raw, b.raw)}; +} +template +HWY_API Vec128 operator-(const Vec128 a, + const Vec128 b) { + return Vec128{wasm_i32x4_sub(a.raw, b.raw)}; +} +template +HWY_API Vec128 operator-(const Vec128 a, + const Vec128 b) { + return Vec128{wasm_i64x2_sub(a.raw, b.raw)}; +} + +// Float +template +HWY_API Vec128 operator-(const Vec128 a, + const Vec128 b) { + return Vec128{wasm_f32x4_sub(a.raw, b.raw)}; +} + +// ------------------------------ SaturatedAdd + +// Returns a + b clamped to the destination range. + +// Unsigned +template +HWY_API Vec128 SaturatedAdd(const Vec128 a, + const Vec128 b) { + return Vec128{wasm_u8x16_add_sat(a.raw, b.raw)}; +} +template +HWY_API Vec128 SaturatedAdd(const Vec128 a, + const Vec128 b) { + return Vec128{wasm_u16x8_add_sat(a.raw, b.raw)}; +} + +// Signed +template +HWY_API Vec128 SaturatedAdd(const Vec128 a, + const Vec128 b) { + return Vec128{wasm_i8x16_add_sat(a.raw, b.raw)}; +} +template +HWY_API Vec128 SaturatedAdd(const Vec128 a, + const Vec128 b) { + return Vec128{wasm_i16x8_add_sat(a.raw, b.raw)}; +} + +// ------------------------------ SaturatedSub + +// Returns a - b clamped to the destination range. 
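+// Illustrative example (editor's note, not part of the upstream source): for
+// uint8_t lanes, SaturatedSub on lanes 5 - 9 yields 0 instead of wrapping to
+// 252, and SaturatedAdd on lanes 250 + 9 yields 255.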
+ +// Unsigned +template +HWY_API Vec128 SaturatedSub(const Vec128 a, + const Vec128 b) { + return Vec128{wasm_u8x16_sub_sat(a.raw, b.raw)}; +} +template +HWY_API Vec128 SaturatedSub(const Vec128 a, + const Vec128 b) { + return Vec128{wasm_u16x8_sub_sat(a.raw, b.raw)}; +} + +// Signed +template +HWY_API Vec128 SaturatedSub(const Vec128 a, + const Vec128 b) { + return Vec128{wasm_i8x16_sub_sat(a.raw, b.raw)}; +} +template +HWY_API Vec128 SaturatedSub(const Vec128 a, + const Vec128 b) { + return Vec128{wasm_i16x8_sub_sat(a.raw, b.raw)}; +} + +// ------------------------------ Average + +// Returns (a + b + 1) / 2 + +// Unsigned +template +HWY_API Vec128 AverageRound(const Vec128 a, + const Vec128 b) { + return Vec128{wasm_u8x16_avgr(a.raw, b.raw)}; +} +template +HWY_API Vec128 AverageRound(const Vec128 a, + const Vec128 b) { + return Vec128{wasm_u16x8_avgr(a.raw, b.raw)}; +} + +// ------------------------------ Absolute value + +// Returns absolute value, except that LimitsMin() maps to LimitsMax() + 1. +template +HWY_API Vec128 Abs(const Vec128 v) { + return Vec128{wasm_i8x16_abs(v.raw)}; +} +template +HWY_API Vec128 Abs(const Vec128 v) { + return Vec128{wasm_i16x8_abs(v.raw)}; +} +template +HWY_API Vec128 Abs(const Vec128 v) { + return Vec128{wasm_i32x4_abs(v.raw)}; +} +template +HWY_API Vec128 Abs(const Vec128 v) { + return Vec128{wasm_i64x2_abs(v.raw)}; +} + +template +HWY_API Vec128 Abs(const Vec128 v) { + return Vec128{wasm_f32x4_abs(v.raw)}; +} + +// ------------------------------ Shift lanes by constant #bits + +// Unsigned +template +HWY_API Vec128 ShiftLeft(const Vec128 v) { + return Vec128{wasm_i16x8_shl(v.raw, kBits)}; +} +template +HWY_API Vec128 ShiftRight(const Vec128 v) { + return Vec128{wasm_u16x8_shr(v.raw, kBits)}; +} +template +HWY_API Vec128 ShiftLeft(const Vec128 v) { + return Vec128{wasm_i32x4_shl(v.raw, kBits)}; +} +template +HWY_API Vec128 ShiftLeft(const Vec128 v) { + return Vec128{wasm_i64x2_shl(v.raw, kBits)}; +} +template +HWY_API Vec128 ShiftRight(const Vec128 v) { + return Vec128{wasm_u32x4_shr(v.raw, kBits)}; +} +template +HWY_API Vec128 ShiftRight(const Vec128 v) { + return Vec128{wasm_u64x2_shr(v.raw, kBits)}; +} + +// Signed +template +HWY_API Vec128 ShiftLeft(const Vec128 v) { + return Vec128{wasm_i16x8_shl(v.raw, kBits)}; +} +template +HWY_API Vec128 ShiftRight(const Vec128 v) { + return Vec128{wasm_i16x8_shr(v.raw, kBits)}; +} +template +HWY_API Vec128 ShiftLeft(const Vec128 v) { + return Vec128{wasm_i32x4_shl(v.raw, kBits)}; +} +template +HWY_API Vec128 ShiftLeft(const Vec128 v) { + return Vec128{wasm_i64x2_shl(v.raw, kBits)}; +} +template +HWY_API Vec128 ShiftRight(const Vec128 v) { + return Vec128{wasm_i32x4_shr(v.raw, kBits)}; +} +template +HWY_API Vec128 ShiftRight(const Vec128 v) { + return Vec128{wasm_i64x2_shr(v.raw, kBits)}; +} + +// 8-bit +template +HWY_API Vec128 ShiftLeft(const Vec128 v) { + const DFromV d8; + // Use raw instead of BitCast to support N=1. + const Vec128 shifted{ShiftLeft(Vec128>{v.raw}).raw}; + return kBits == 1 + ? (v + v) + : (shifted & Set(d8, static_cast((0xFF << kBits) & 0xFF))); +} + +template +HWY_API Vec128 ShiftRight(const Vec128 v) { + const DFromV d8; + // Use raw instead of BitCast to support N=1. 
+ const Vec128 shifted{ + ShiftRight(Vec128{v.raw}).raw}; + return shifted & Set(d8, 0xFF >> kBits); +} + +template +HWY_API Vec128 ShiftRight(const Vec128 v) { + const DFromV di; + const RebindToUnsigned du; + const auto shifted = BitCast(di, ShiftRight(BitCast(du, v))); + const auto shifted_sign = BitCast(di, Set(du, 0x80 >> kBits)); + return (shifted ^ shifted_sign) - shifted_sign; +} + +// ------------------------------ RotateRight (ShiftRight, Or) +template +HWY_API Vec128 RotateRight(const Vec128 v) { + constexpr size_t kSizeInBits = sizeof(T) * 8; + static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift count"); + if (kBits == 0) return v; + return Or(ShiftRight(v), ShiftLeft(v)); +} + +// ------------------------------ Shift lanes by same variable #bits + +// After https://reviews.llvm.org/D108415 shift argument became unsigned. +HWY_DIAGNOSTICS(push) +HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion") + +// Unsigned +template +HWY_API Vec128 ShiftLeftSame(const Vec128 v, + const int bits) { + return Vec128{wasm_i16x8_shl(v.raw, bits)}; +} +template +HWY_API Vec128 ShiftRightSame(const Vec128 v, + const int bits) { + return Vec128{wasm_u16x8_shr(v.raw, bits)}; +} +template +HWY_API Vec128 ShiftLeftSame(const Vec128 v, + const int bits) { + return Vec128{wasm_i32x4_shl(v.raw, bits)}; +} +template +HWY_API Vec128 ShiftRightSame(const Vec128 v, + const int bits) { + return Vec128{wasm_u32x4_shr(v.raw, bits)}; +} +template +HWY_API Vec128 ShiftLeftSame(const Vec128 v, + const int bits) { + return Vec128{wasm_i64x2_shl(v.raw, bits)}; +} +template +HWY_API Vec128 ShiftRightSame(const Vec128 v, + const int bits) { + return Vec128{wasm_u64x2_shr(v.raw, bits)}; +} + +// Signed +template +HWY_API Vec128 ShiftLeftSame(const Vec128 v, + const int bits) { + return Vec128{wasm_i16x8_shl(v.raw, bits)}; +} +template +HWY_API Vec128 ShiftRightSame(const Vec128 v, + const int bits) { + return Vec128{wasm_i16x8_shr(v.raw, bits)}; +} +template +HWY_API Vec128 ShiftLeftSame(const Vec128 v, + const int bits) { + return Vec128{wasm_i32x4_shl(v.raw, bits)}; +} +template +HWY_API Vec128 ShiftRightSame(const Vec128 v, + const int bits) { + return Vec128{wasm_i32x4_shr(v.raw, bits)}; +} +template +HWY_API Vec128 ShiftLeftSame(const Vec128 v, + const int bits) { + return Vec128{wasm_i64x2_shl(v.raw, bits)}; +} +template +HWY_API Vec128 ShiftRightSame(const Vec128 v, + const int bits) { + return Vec128{wasm_i64x2_shr(v.raw, bits)}; +} + +// 8-bit +template +HWY_API Vec128 ShiftLeftSame(const Vec128 v, const int bits) { + const DFromV d8; + // Use raw instead of BitCast to support N=1. + const Vec128 shifted{ + ShiftLeftSame(Vec128>{v.raw}, bits).raw}; + return shifted & Set(d8, static_cast((0xFF << bits) & 0xFF)); +} + +template +HWY_API Vec128 ShiftRightSame(Vec128 v, + const int bits) { + const DFromV d8; + // Use raw instead of BitCast to support N=1. 
+ const Vec128 shifted{ + ShiftRightSame(Vec128{v.raw}, bits).raw}; + return shifted & Set(d8, 0xFF >> bits); +} + +template +HWY_API Vec128 ShiftRightSame(Vec128 v, const int bits) { + const DFromV di; + const RebindToUnsigned du; + const auto shifted = BitCast(di, ShiftRightSame(BitCast(du, v), bits)); + const auto shifted_sign = BitCast(di, Set(du, 0x80 >> bits)); + return (shifted ^ shifted_sign) - shifted_sign; +} + +// ignore Wsign-conversion +HWY_DIAGNOSTICS(pop) + +// ------------------------------ Minimum + +// Unsigned +template +HWY_API Vec128 Min(Vec128 a, Vec128 b) { + return Vec128{wasm_u8x16_min(a.raw, b.raw)}; +} +template +HWY_API Vec128 Min(Vec128 a, Vec128 b) { + return Vec128{wasm_u16x8_min(a.raw, b.raw)}; +} +template +HWY_API Vec128 Min(Vec128 a, Vec128 b) { + return Vec128{wasm_u32x4_min(a.raw, b.raw)}; +} +template +HWY_API Vec128 Min(Vec128 a, Vec128 b) { + // Avoid wasm_u64x2_extract_lane - not all implementations have it yet. + const uint64_t a0 = static_cast(wasm_i64x2_extract_lane(a.raw, 0)); + const uint64_t b0 = static_cast(wasm_i64x2_extract_lane(b.raw, 0)); + const uint64_t a1 = static_cast(wasm_i64x2_extract_lane(a.raw, 1)); + const uint64_t b1 = static_cast(wasm_i64x2_extract_lane(b.raw, 1)); + alignas(16) uint64_t min[2] = {HWY_MIN(a0, b0), HWY_MIN(a1, b1)}; + return Vec128{wasm_v128_load(min)}; +} + +// Signed +template +HWY_API Vec128 Min(Vec128 a, Vec128 b) { + return Vec128{wasm_i8x16_min(a.raw, b.raw)}; +} +template +HWY_API Vec128 Min(Vec128 a, Vec128 b) { + return Vec128{wasm_i16x8_min(a.raw, b.raw)}; +} +template +HWY_API Vec128 Min(Vec128 a, Vec128 b) { + return Vec128{wasm_i32x4_min(a.raw, b.raw)}; +} +template +HWY_API Vec128 Min(Vec128 a, Vec128 b) { + alignas(16) int64_t min[4]; + min[0] = HWY_MIN(wasm_i64x2_extract_lane(a.raw, 0), + wasm_i64x2_extract_lane(b.raw, 0)); + min[1] = HWY_MIN(wasm_i64x2_extract_lane(a.raw, 1), + wasm_i64x2_extract_lane(b.raw, 1)); + return Vec128{wasm_v128_load(min)}; +} + +// Float +template +HWY_API Vec128 Min(Vec128 a, Vec128 b) { + return Vec128{wasm_f32x4_min(a.raw, b.raw)}; +} + +// ------------------------------ Maximum + +// Unsigned +template +HWY_API Vec128 Max(Vec128 a, Vec128 b) { + return Vec128{wasm_u8x16_max(a.raw, b.raw)}; +} +template +HWY_API Vec128 Max(Vec128 a, Vec128 b) { + return Vec128{wasm_u16x8_max(a.raw, b.raw)}; +} +template +HWY_API Vec128 Max(Vec128 a, Vec128 b) { + return Vec128{wasm_u32x4_max(a.raw, b.raw)}; +} +template +HWY_API Vec128 Max(Vec128 a, Vec128 b) { + // Avoid wasm_u64x2_extract_lane - not all implementations have it yet. 
+ const uint64_t a0 = static_cast(wasm_i64x2_extract_lane(a.raw, 0)); + const uint64_t b0 = static_cast(wasm_i64x2_extract_lane(b.raw, 0)); + const uint64_t a1 = static_cast(wasm_i64x2_extract_lane(a.raw, 1)); + const uint64_t b1 = static_cast(wasm_i64x2_extract_lane(b.raw, 1)); + alignas(16) uint64_t max[2] = {HWY_MAX(a0, b0), HWY_MAX(a1, b1)}; + return Vec128{wasm_v128_load(max)}; +} + +// Signed +template +HWY_API Vec128 Max(Vec128 a, Vec128 b) { + return Vec128{wasm_i8x16_max(a.raw, b.raw)}; +} +template +HWY_API Vec128 Max(Vec128 a, Vec128 b) { + return Vec128{wasm_i16x8_max(a.raw, b.raw)}; +} +template +HWY_API Vec128 Max(Vec128 a, Vec128 b) { + return Vec128{wasm_i32x4_max(a.raw, b.raw)}; +} +template +HWY_API Vec128 Max(Vec128 a, Vec128 b) { + alignas(16) int64_t max[2]; + max[0] = HWY_MAX(wasm_i64x2_extract_lane(a.raw, 0), + wasm_i64x2_extract_lane(b.raw, 0)); + max[1] = HWY_MAX(wasm_i64x2_extract_lane(a.raw, 1), + wasm_i64x2_extract_lane(b.raw, 1)); + return Vec128{wasm_v128_load(max)}; +} + +// Float +template +HWY_API Vec128 Max(Vec128 a, Vec128 b) { + return Vec128{wasm_f32x4_max(a.raw, b.raw)}; +} + +// ------------------------------ Integer multiplication + +// Unsigned +template +HWY_API Vec128 operator*(const Vec128 a, + const Vec128 b) { + return Vec128{wasm_i16x8_mul(a.raw, b.raw)}; +} +template +HWY_API Vec128 operator*(const Vec128 a, + const Vec128 b) { + return Vec128{wasm_i32x4_mul(a.raw, b.raw)}; +} + +// Signed +template +HWY_API Vec128 operator*(const Vec128 a, + const Vec128 b) { + return Vec128{wasm_i16x8_mul(a.raw, b.raw)}; +} +template +HWY_API Vec128 operator*(const Vec128 a, + const Vec128 b) { + return Vec128{wasm_i32x4_mul(a.raw, b.raw)}; +} + +// Returns the upper 16 bits of a * b in each lane. +template +HWY_API Vec128 MulHigh(const Vec128 a, + const Vec128 b) { + // TODO(eustas): replace, when implemented in WASM. + const auto al = wasm_u32x4_extend_low_u16x8(a.raw); + const auto ah = wasm_u32x4_extend_high_u16x8(a.raw); + const auto bl = wasm_u32x4_extend_low_u16x8(b.raw); + const auto bh = wasm_u32x4_extend_high_u16x8(b.raw); + const auto l = wasm_i32x4_mul(al, bl); + const auto h = wasm_i32x4_mul(ah, bh); + // TODO(eustas): shift-right + narrow? + return Vec128{ + wasm_i16x8_shuffle(l, h, 1, 3, 5, 7, 9, 11, 13, 15)}; +} +template +HWY_API Vec128 MulHigh(const Vec128 a, + const Vec128 b) { + // TODO(eustas): replace, when implemented in WASM. + const auto al = wasm_i32x4_extend_low_i16x8(a.raw); + const auto ah = wasm_i32x4_extend_high_i16x8(a.raw); + const auto bl = wasm_i32x4_extend_low_i16x8(b.raw); + const auto bh = wasm_i32x4_extend_high_i16x8(b.raw); + const auto l = wasm_i32x4_mul(al, bl); + const auto h = wasm_i32x4_mul(ah, bh); + // TODO(eustas): shift-right + narrow? + return Vec128{ + wasm_i16x8_shuffle(l, h, 1, 3, 5, 7, 9, 11, 13, 15)}; +} + +template +HWY_API Vec128 MulFixedPoint15(Vec128 a, + Vec128 b) { + const DFromV d; + const RebindToUnsigned du; + + const Vec128 lo = BitCast(du, Mul(a, b)); + const Vec128 hi = MulHigh(a, b); + // We want (lo + 0x4000) >> 15, but that can overflow, and if it does we must + // carry that into the result. Instead isolate the top two bits because only + // they can influence the result. + const Vec128 lo_top2 = ShiftRight<14>(lo); + // Bits 11: add 2, 10: add 1, 01: add 1, 00: add 0. + const Vec128 rounding = ShiftRight<1>(Add(lo_top2, Set(du, 1))); + return Add(Add(hi, hi), BitCast(d, rounding)); +} + +// Multiplies even lanes (0, 2 ..) and returns the double-width result. 
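+// Illustrative example (editor's note, not part of the upstream source):
+// given int32 lanes {a0, a1, a2, a3} and {b0, b1, b2, b3}, MulEven returns
+// the two int64 lanes {a0*b0, a2*b2}, each computed without overflow in the
+// wider type.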
+template +HWY_API Vec128 MulEven(const Vec128 a, + const Vec128 b) { + // TODO(eustas): replace, when implemented in WASM. + const auto kEvenMask = wasm_i32x4_make(-1, 0, -1, 0); + const auto ae = wasm_v128_and(a.raw, kEvenMask); + const auto be = wasm_v128_and(b.raw, kEvenMask); + return Vec128{wasm_i64x2_mul(ae, be)}; +} +template +HWY_API Vec128 MulEven(const Vec128 a, + const Vec128 b) { + // TODO(eustas): replace, when implemented in WASM. + const auto kEvenMask = wasm_i32x4_make(-1, 0, -1, 0); + const auto ae = wasm_v128_and(a.raw, kEvenMask); + const auto be = wasm_v128_and(b.raw, kEvenMask); + return Vec128{wasm_i64x2_mul(ae, be)}; +} + +// ------------------------------ Negate + +template +HWY_API Vec128 Neg(const Vec128 v) { + return Xor(v, SignBit(DFromV())); +} + +template +HWY_API Vec128 Neg(const Vec128 v) { + return Vec128{wasm_i8x16_neg(v.raw)}; +} +template +HWY_API Vec128 Neg(const Vec128 v) { + return Vec128{wasm_i16x8_neg(v.raw)}; +} +template +HWY_API Vec128 Neg(const Vec128 v) { + return Vec128{wasm_i32x4_neg(v.raw)}; +} +template +HWY_API Vec128 Neg(const Vec128 v) { + return Vec128{wasm_i64x2_neg(v.raw)}; +} + +// ------------------------------ Floating-point mul / div + +template +HWY_API Vec128 operator*(Vec128 a, Vec128 b) { + return Vec128{wasm_f32x4_mul(a.raw, b.raw)}; +} + +template +HWY_API Vec128 operator/(const Vec128 a, + const Vec128 b) { + return Vec128{wasm_f32x4_div(a.raw, b.raw)}; +} + +// Approximate reciprocal +template +HWY_API Vec128 ApproximateReciprocal(const Vec128 v) { + const Vec128 one = Vec128{wasm_f32x4_splat(1.0f)}; + return one / v; +} + +// Absolute value of difference. +template +HWY_API Vec128 AbsDiff(const Vec128 a, + const Vec128 b) { + return Abs(a - b); +} + +// ------------------------------ Floating-point multiply-add variants + +// Returns mul * x + add +template +HWY_API Vec128 MulAdd(const Vec128 mul, + const Vec128 x, + const Vec128 add) { + // TODO(eustas): replace, when implemented in WASM. + // TODO(eustas): is it wasm_f32x4_qfma? + return mul * x + add; +} + +// Returns add - mul * x +template +HWY_API Vec128 NegMulAdd(const Vec128 mul, + const Vec128 x, + const Vec128 add) { + // TODO(eustas): replace, when implemented in WASM. + return add - mul * x; +} + +// Returns mul * x - sub +template +HWY_API Vec128 MulSub(const Vec128 mul, + const Vec128 x, + const Vec128 sub) { + // TODO(eustas): replace, when implemented in WASM. + // TODO(eustas): is it wasm_f32x4_qfms? + return mul * x - sub; +} + +// Returns -mul * x - sub +template +HWY_API Vec128 NegMulSub(const Vec128 mul, + const Vec128 x, + const Vec128 sub) { + // TODO(eustas): replace, when implemented in WASM. + return Neg(mul) * x - sub; +} + +// ------------------------------ Floating-point square root + +// Full precision square root +template +HWY_API Vec128 Sqrt(const Vec128 v) { + return Vec128{wasm_f32x4_sqrt(v.raw)}; +} + +// Approximate reciprocal square root +template +HWY_API Vec128 ApproximateReciprocalSqrt(const Vec128 v) { + // TODO(eustas): find cheaper a way to calculate this. 
+ const Vec128 one = Vec128{wasm_f32x4_splat(1.0f)}; + return one / Sqrt(v); +} + +// ------------------------------ Floating-point rounding + +// Toward nearest integer, ties to even +template +HWY_API Vec128 Round(const Vec128 v) { + return Vec128{wasm_f32x4_nearest(v.raw)}; +} + +// Toward zero, aka truncate +template +HWY_API Vec128 Trunc(const Vec128 v) { + return Vec128{wasm_f32x4_trunc(v.raw)}; +} + +// Toward +infinity, aka ceiling +template +HWY_API Vec128 Ceil(const Vec128 v) { + return Vec128{wasm_f32x4_ceil(v.raw)}; +} + +// Toward -infinity, aka floor +template +HWY_API Vec128 Floor(const Vec128 v) { + return Vec128{wasm_f32x4_floor(v.raw)}; +} + +// ------------------------------ Floating-point classification +template +HWY_API Mask128 IsNaN(const Vec128 v) { + return v != v; +} + +template +HWY_API Mask128 IsInf(const Vec128 v) { + const Simd d; + const RebindToSigned di; + const VFromD vi = BitCast(di, v); + // 'Shift left' to clear the sign bit, check for exponent=max and mantissa=0. + return RebindMask(d, Eq(Add(vi, vi), Set(di, hwy::MaxExponentTimes2()))); +} + +// Returns whether normal/subnormal/zero. +template +HWY_API Mask128 IsFinite(const Vec128 v) { + const Simd d; + const RebindToUnsigned du; + const RebindToSigned di; // cheaper than unsigned comparison + const VFromD vu = BitCast(du, v); + // 'Shift left' to clear the sign bit, then right so we can compare with the + // max exponent (cannot compare with MaxExponentTimes2 directly because it is + // negative and non-negative floats would be greater). + const VFromD exp = + BitCast(di, ShiftRight() + 1>(Add(vu, vu))); + return RebindMask(d, Lt(exp, Set(di, hwy::MaxExponentField()))); +} + +// ================================================== COMPARE + +// Comparisons fill a lane with 1-bits if the condition is true, else 0. 
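+// Illustrative example (editor's note, not part of the upstream source):
+// comparing uint32 lanes {1, 2, 3, 4} == {1, 0, 3, 0} yields mask lanes
+// {0xFFFFFFFF, 0, 0xFFFFFFFF, 0}, suitable for IfThenElse below.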
+ +template +HWY_API Mask128 RebindMask(Simd /*tag*/, + Mask128 m) { + static_assert(sizeof(TFrom) == sizeof(TTo), "Must have same size"); + return Mask128{m.raw}; +} + +template +HWY_API Mask128 TestBit(Vec128 v, Vec128 bit) { + static_assert(!hwy::IsFloat(), "Only integer vectors supported"); + return (v & bit) == bit; +} + +// ------------------------------ Equality + +// Unsigned +template +HWY_API Mask128 operator==(const Vec128 a, + const Vec128 b) { + return Mask128{wasm_i8x16_eq(a.raw, b.raw)}; +} +template +HWY_API Mask128 operator==(const Vec128 a, + const Vec128 b) { + return Mask128{wasm_i16x8_eq(a.raw, b.raw)}; +} +template +HWY_API Mask128 operator==(const Vec128 a, + const Vec128 b) { + return Mask128{wasm_i32x4_eq(a.raw, b.raw)}; +} +template +HWY_API Mask128 operator==(const Vec128 a, + const Vec128 b) { + return Mask128{wasm_i64x2_eq(a.raw, b.raw)}; +} + +// Signed +template +HWY_API Mask128 operator==(const Vec128 a, + const Vec128 b) { + return Mask128{wasm_i8x16_eq(a.raw, b.raw)}; +} +template +HWY_API Mask128 operator==(Vec128 a, + Vec128 b) { + return Mask128{wasm_i16x8_eq(a.raw, b.raw)}; +} +template +HWY_API Mask128 operator==(const Vec128 a, + const Vec128 b) { + return Mask128{wasm_i32x4_eq(a.raw, b.raw)}; +} +template +HWY_API Mask128 operator==(const Vec128 a, + const Vec128 b) { + return Mask128{wasm_i64x2_eq(a.raw, b.raw)}; +} + +// Float +template +HWY_API Mask128 operator==(const Vec128 a, + const Vec128 b) { + return Mask128{wasm_f32x4_eq(a.raw, b.raw)}; +} + +// ------------------------------ Inequality + +// Unsigned +template +HWY_API Mask128 operator!=(const Vec128 a, + const Vec128 b) { + return Mask128{wasm_i8x16_ne(a.raw, b.raw)}; +} +template +HWY_API Mask128 operator!=(const Vec128 a, + const Vec128 b) { + return Mask128{wasm_i16x8_ne(a.raw, b.raw)}; +} +template +HWY_API Mask128 operator!=(const Vec128 a, + const Vec128 b) { + return Mask128{wasm_i32x4_ne(a.raw, b.raw)}; +} +template +HWY_API Mask128 operator!=(const Vec128 a, + const Vec128 b) { + return Mask128{wasm_i64x2_ne(a.raw, b.raw)}; +} + +// Signed +template +HWY_API Mask128 operator!=(const Vec128 a, + const Vec128 b) { + return Mask128{wasm_i8x16_ne(a.raw, b.raw)}; +} +template +HWY_API Mask128 operator!=(const Vec128 a, + const Vec128 b) { + return Mask128{wasm_i16x8_ne(a.raw, b.raw)}; +} +template +HWY_API Mask128 operator!=(const Vec128 a, + const Vec128 b) { + return Mask128{wasm_i32x4_ne(a.raw, b.raw)}; +} +template +HWY_API Mask128 operator!=(const Vec128 a, + const Vec128 b) { + return Mask128{wasm_i64x2_ne(a.raw, b.raw)}; +} + +// Float +template +HWY_API Mask128 operator!=(const Vec128 a, + const Vec128 b) { + return Mask128{wasm_f32x4_ne(a.raw, b.raw)}; +} + +// ------------------------------ Strict inequality + +template +HWY_API Mask128 operator>(const Vec128 a, + const Vec128 b) { + return Mask128{wasm_i8x16_gt(a.raw, b.raw)}; +} +template +HWY_API Mask128 operator>(const Vec128 a, + const Vec128 b) { + return Mask128{wasm_i16x8_gt(a.raw, b.raw)}; +} +template +HWY_API Mask128 operator>(const Vec128 a, + const Vec128 b) { + return Mask128{wasm_i32x4_gt(a.raw, b.raw)}; +} +template +HWY_API Mask128 operator>(const Vec128 a, + const Vec128 b) { + return Mask128{wasm_i64x2_gt(a.raw, b.raw)}; +} + +template +HWY_API Mask128 operator>(const Vec128 a, + const Vec128 b) { + return Mask128{wasm_u8x16_gt(a.raw, b.raw)}; +} +template +HWY_API Mask128 operator>(const Vec128 a, + const Vec128 b) { + return Mask128{wasm_u16x8_gt(a.raw, b.raw)}; +} +template +HWY_API Mask128 
operator>(const Vec128 a, + const Vec128 b) { + return Mask128{wasm_u32x4_gt(a.raw, b.raw)}; +} +template +HWY_API Mask128 operator>(const Vec128 a, + const Vec128 b) { + const DFromV d; + const Repartition d32; + const auto a32 = BitCast(d32, a); + const auto b32 = BitCast(d32, b); + // If the upper halves are not equal, this is the answer. + const auto m_gt = a32 > b32; + + // Otherwise, the lower half decides. + const auto m_eq = a32 == b32; + const auto lo_in_hi = wasm_i32x4_shuffle(m_gt.raw, m_gt.raw, 0, 0, 2, 2); + const auto lo_gt = And(m_eq, MaskFromVec(VFromD{lo_in_hi})); + + const auto gt = Or(lo_gt, m_gt); + // Copy result in upper 32 bits to lower 32 bits. + return Mask128{wasm_i32x4_shuffle(gt.raw, gt.raw, 1, 1, 3, 3)}; +} + +template +HWY_API Mask128 operator>(const Vec128 a, + const Vec128 b) { + return Mask128{wasm_f32x4_gt(a.raw, b.raw)}; +} + +template +HWY_API Mask128 operator<(const Vec128 a, const Vec128 b) { + return operator>(b, a); +} + +// ------------------------------ Weak inequality + +// Float <= >= +template +HWY_API Mask128 operator<=(const Vec128 a, + const Vec128 b) { + return Mask128{wasm_f32x4_le(a.raw, b.raw)}; +} +template +HWY_API Mask128 operator>=(const Vec128 a, + const Vec128 b) { + return Mask128{wasm_f32x4_ge(a.raw, b.raw)}; +} + +// ------------------------------ FirstN (Iota, Lt) + +template +HWY_API Mask128 FirstN(const Simd d, size_t num) { + const RebindToSigned di; // Signed comparisons may be cheaper. + return RebindMask(d, Iota(di, 0) < Set(di, static_cast>(num))); +} + +// ================================================== LOGICAL + +// ------------------------------ Not + +template +HWY_API Vec128 Not(Vec128 v) { + return Vec128{wasm_v128_not(v.raw)}; +} + +// ------------------------------ And + +template +HWY_API Vec128 And(Vec128 a, Vec128 b) { + return Vec128{wasm_v128_and(a.raw, b.raw)}; +} + +// ------------------------------ AndNot + +// Returns ~not_mask & mask. 
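+// Illustrative example (editor's note, not part of the upstream source):
+// AndNot(not_mask = 0b1100, mask = 0b1010) computes ~0b1100 & 0b1010 =
+// 0b0010, i.e. the bits of `mask` that are not set in `not_mask`.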
+template +HWY_API Vec128 AndNot(Vec128 not_mask, Vec128 mask) { + return Vec128{wasm_v128_andnot(mask.raw, not_mask.raw)}; +} + +// ------------------------------ Or + +template +HWY_API Vec128 Or(Vec128 a, Vec128 b) { + return Vec128{wasm_v128_or(a.raw, b.raw)}; +} + +// ------------------------------ Xor + +template +HWY_API Vec128 Xor(Vec128 a, Vec128 b) { + return Vec128{wasm_v128_xor(a.raw, b.raw)}; +} + +// ------------------------------ Or3 + +template +HWY_API Vec128 Or3(Vec128 o1, Vec128 o2, Vec128 o3) { + return Or(o1, Or(o2, o3)); +} + +// ------------------------------ OrAnd + +template +HWY_API Vec128 OrAnd(Vec128 o, Vec128 a1, Vec128 a2) { + return Or(o, And(a1, a2)); +} + +// ------------------------------ IfVecThenElse + +template +HWY_API Vec128 IfVecThenElse(Vec128 mask, Vec128 yes, + Vec128 no) { + return IfThenElse(MaskFromVec(mask), yes, no); +} + +// ------------------------------ Operator overloads (internal-only if float) + +template +HWY_API Vec128 operator&(const Vec128 a, const Vec128 b) { + return And(a, b); +} + +template +HWY_API Vec128 operator|(const Vec128 a, const Vec128 b) { + return Or(a, b); +} + +template +HWY_API Vec128 operator^(const Vec128 a, const Vec128 b) { + return Xor(a, b); +} + +// ------------------------------ CopySign + +template +HWY_API Vec128 CopySign(const Vec128 magn, + const Vec128 sign) { + static_assert(IsFloat(), "Only makes sense for floating-point"); + const auto msb = SignBit(DFromV()); + return Or(AndNot(msb, magn), And(msb, sign)); +} + +template +HWY_API Vec128 CopySignToAbs(const Vec128 abs, + const Vec128 sign) { + static_assert(IsFloat(), "Only makes sense for floating-point"); + return Or(abs, And(SignBit(DFromV()), sign)); +} + +// ------------------------------ BroadcastSignBit (compare) + +template +HWY_API Vec128 BroadcastSignBit(const Vec128 v) { + return ShiftRight(v); +} +template +HWY_API Vec128 BroadcastSignBit(const Vec128 v) { + const DFromV d; + return VecFromMask(d, v < Zero(d)); +} + +// ------------------------------ Mask + +// Mask and Vec are the same (true = FF..FF). +template +HWY_API Mask128 MaskFromVec(const Vec128 v) { + return Mask128{v.raw}; +} + +template +HWY_API Vec128 VecFromMask(Simd /* tag */, Mask128 v) { + return Vec128{v.raw}; +} + +// mask ? yes : no +template +HWY_API Vec128 IfThenElse(Mask128 mask, Vec128 yes, + Vec128 no) { + return Vec128{wasm_v128_bitselect(yes.raw, no.raw, mask.raw)}; +} + +// mask ? yes : 0 +template +HWY_API Vec128 IfThenElseZero(Mask128 mask, Vec128 yes) { + return yes & VecFromMask(DFromV(), mask); +} + +// mask ? 
0 : no +template +HWY_API Vec128 IfThenZeroElse(Mask128 mask, Vec128 no) { + return AndNot(VecFromMask(DFromV(), mask), no); +} + +template +HWY_API Vec128 IfNegativeThenElse(Vec128 v, Vec128 yes, + Vec128 no) { + static_assert(IsSigned(), "Only works for signed/float"); + const DFromV d; + const RebindToSigned di; + + v = BitCast(d, BroadcastSignBit(BitCast(di, v))); + return IfThenElse(MaskFromVec(v), yes, no); +} + +template +HWY_API Vec128 ZeroIfNegative(Vec128 v) { + const DFromV d; + const auto zero = Zero(d); + return IfThenElse(Mask128{(v > zero).raw}, v, zero); +} + +// ------------------------------ Mask logical + +template +HWY_API Mask128 Not(const Mask128 m) { + return MaskFromVec(Not(VecFromMask(Simd(), m))); +} + +template +HWY_API Mask128 And(const Mask128 a, Mask128 b) { + const Simd d; + return MaskFromVec(And(VecFromMask(d, a), VecFromMask(d, b))); +} + +template +HWY_API Mask128 AndNot(const Mask128 a, Mask128 b) { + const Simd d; + return MaskFromVec(AndNot(VecFromMask(d, a), VecFromMask(d, b))); +} + +template +HWY_API Mask128 Or(const Mask128 a, Mask128 b) { + const Simd d; + return MaskFromVec(Or(VecFromMask(d, a), VecFromMask(d, b))); +} + +template +HWY_API Mask128 Xor(const Mask128 a, Mask128 b) { + const Simd d; + return MaskFromVec(Xor(VecFromMask(d, a), VecFromMask(d, b))); +} + +template +HWY_API Mask128 ExclusiveNeither(const Mask128 a, Mask128 b) { + const Simd d; + return MaskFromVec(AndNot(VecFromMask(d, a), Not(VecFromMask(d, b)))); +} + +// ------------------------------ Shl (BroadcastSignBit, IfThenElse) + +// The x86 multiply-by-Pow2() trick will not work because WASM saturates +// float->int correctly to 2^31-1 (not 2^31). Because WASM's shifts take a +// scalar count operand, per-lane shift instructions would require extract_lane +// for each lane, and hoping that shuffle is correctly mapped to a native +// instruction. Using non-vector shifts would incur a store-load forwarding +// stall when loading the result vector. We instead test bits of the shift +// count to "predicate" a shift of the entire vector by a constant. + +template +HWY_API Vec128 operator<<(Vec128 v, const Vec128 bits) { + const DFromV d; + Mask128 mask; + // Need a signed type for BroadcastSignBit. + auto test = BitCast(RebindToSigned(), bits); + // Move the highest valid bit of the shift count into the sign bit. + test = ShiftLeft<12>(test); + + mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); + test = ShiftLeft<1>(test); // next bit (descending order) + v = IfThenElse(mask, ShiftLeft<8>(v), v); + + mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); + test = ShiftLeft<1>(test); // next bit (descending order) + v = IfThenElse(mask, ShiftLeft<4>(v), v); + + mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); + test = ShiftLeft<1>(test); // next bit (descending order) + v = IfThenElse(mask, ShiftLeft<2>(v), v); + + mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); + return IfThenElse(mask, ShiftLeft<1>(v), v); +} + +template +HWY_API Vec128 operator<<(Vec128 v, const Vec128 bits) { + const DFromV d; + Mask128 mask; + // Need a signed type for BroadcastSignBit. + auto test = BitCast(RebindToSigned(), bits); + // Move the highest valid bit of the shift count into the sign bit. 
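+  // For 32-bit lanes, the shift count occupies bits [4:0]; bit 4 is the
+  // highest valid bit, and 31 - 4 = 27 moves it into the sign bit (the
+  // 16-bit overload above uses 15 - 3 = 12 for the same reason).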
+ test = ShiftLeft<27>(test); + + mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); + test = ShiftLeft<1>(test); // next bit (descending order) + v = IfThenElse(mask, ShiftLeft<16>(v), v); + + mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); + test = ShiftLeft<1>(test); // next bit (descending order) + v = IfThenElse(mask, ShiftLeft<8>(v), v); + + mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); + test = ShiftLeft<1>(test); // next bit (descending order) + v = IfThenElse(mask, ShiftLeft<4>(v), v); + + mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); + test = ShiftLeft<1>(test); // next bit (descending order) + v = IfThenElse(mask, ShiftLeft<2>(v), v); + + mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); + return IfThenElse(mask, ShiftLeft<1>(v), v); +} + +template +HWY_API Vec128 operator<<(Vec128 v, const Vec128 bits) { + const DFromV d; + alignas(16) T lanes[2]; + alignas(16) T bits_lanes[2]; + Store(v, d, lanes); + Store(bits, d, bits_lanes); + lanes[0] <<= bits_lanes[0]; + lanes[1] <<= bits_lanes[1]; + return Load(d, lanes); +} + +// ------------------------------ Shr (BroadcastSignBit, IfThenElse) + +template +HWY_API Vec128 operator>>(Vec128 v, const Vec128 bits) { + const DFromV d; + Mask128 mask; + // Need a signed type for BroadcastSignBit. + auto test = BitCast(RebindToSigned(), bits); + // Move the highest valid bit of the shift count into the sign bit. + test = ShiftLeft<12>(test); + + mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); + test = ShiftLeft<1>(test); // next bit (descending order) + v = IfThenElse(mask, ShiftRight<8>(v), v); + + mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); + test = ShiftLeft<1>(test); // next bit (descending order) + v = IfThenElse(mask, ShiftRight<4>(v), v); + + mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); + test = ShiftLeft<1>(test); // next bit (descending order) + v = IfThenElse(mask, ShiftRight<2>(v), v); + + mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); + return IfThenElse(mask, ShiftRight<1>(v), v); +} + +template +HWY_API Vec128 operator>>(Vec128 v, const Vec128 bits) { + const DFromV d; + Mask128 mask; + // Need a signed type for BroadcastSignBit. + auto test = BitCast(RebindToSigned(), bits); + // Move the highest valid bit of the shift count into the sign bit. 
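+  // Same bit-by-bit "predication" of constant shifts as in operator<< above;
+  // here ShiftRight is arithmetic for signed lanes and logical for unsigned.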
+ test = ShiftLeft<27>(test); + + mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); + test = ShiftLeft<1>(test); // next bit (descending order) + v = IfThenElse(mask, ShiftRight<16>(v), v); + + mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); + test = ShiftLeft<1>(test); // next bit (descending order) + v = IfThenElse(mask, ShiftRight<8>(v), v); + + mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); + test = ShiftLeft<1>(test); // next bit (descending order) + v = IfThenElse(mask, ShiftRight<4>(v), v); + + mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); + test = ShiftLeft<1>(test); // next bit (descending order) + v = IfThenElse(mask, ShiftRight<2>(v), v); + + mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); + return IfThenElse(mask, ShiftRight<1>(v), v); +} + +// ================================================== MEMORY + +// ------------------------------ Load + +template +HWY_API Vec128 Load(Full128 /* tag */, const T* HWY_RESTRICT aligned) { + return Vec128{wasm_v128_load(aligned)}; +} + +template +HWY_API Vec128 MaskedLoad(Mask128 m, Simd d, + const T* HWY_RESTRICT aligned) { + return IfThenElseZero(m, Load(d, aligned)); +} + +// Partial load. +template +HWY_API Vec128 Load(Simd /* tag */, const T* HWY_RESTRICT p) { + Vec128 v; + CopyBytes(p, &v); + return v; +} + +// LoadU == Load. +template +HWY_API Vec128 LoadU(Simd d, const T* HWY_RESTRICT p) { + return Load(d, p); +} + +// 128-bit SIMD => nothing to duplicate, same as an unaligned load. +template +HWY_API Vec128 LoadDup128(Simd d, const T* HWY_RESTRICT p) { + return Load(d, p); +} + +// ------------------------------ Store + +template +HWY_API void Store(Vec128 v, Full128 /* tag */, T* HWY_RESTRICT aligned) { + wasm_v128_store(aligned, v.raw); +} + +// Partial store. +template +HWY_API void Store(Vec128 v, Simd /* tag */, T* HWY_RESTRICT p) { + CopyBytes(&v, p); +} + +HWY_API void Store(const Vec128 v, Simd /* tag */, + float* HWY_RESTRICT p) { + *p = wasm_f32x4_extract_lane(v.raw, 0); +} + +// StoreU == Store. +template +HWY_API void StoreU(Vec128 v, Simd d, T* HWY_RESTRICT p) { + Store(v, d, p); +} + +template +HWY_API void BlendedStore(Vec128 v, Mask128 m, Simd d, + T* HWY_RESTRICT p) { + StoreU(IfThenElse(m, v, LoadU(d, p)), d, p); +} + +// ------------------------------ Non-temporal stores + +// Same as aligned stores on non-x86. 
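+
+// Example (sketch, not part of the library): because Stream lowers to a
+// plain store on this target, a "non-temporal" copy is just an ordinary
+// store loop. Illustrative only; assumes count is a multiple of Lanes(d)
+// and both pointers are 16-byte aligned.
+template <class D, typename T = TFromD<D>>
+void StreamCopy(D d, const T* HWY_RESTRICT from, T* HWY_RESTRICT to,
+                size_t count) {
+  for (size_t i = 0; i < count; i += Lanes(d)) {
+    Stream(Load(d, from + i), d, to + i);
+  }
+}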
+ +template +HWY_API void Stream(Vec128 v, Simd /* tag */, + T* HWY_RESTRICT aligned) { + wasm_v128_store(aligned, v.raw); +} + +// ------------------------------ Scatter (Store) + +template +HWY_API void ScatterOffset(Vec128 v, Simd d, + T* HWY_RESTRICT base, + const Vec128 offset) { + static_assert(sizeof(T) == sizeof(Offset), "Must match for portability"); + + alignas(16) T lanes[N]; + Store(v, d, lanes); + + alignas(16) Offset offset_lanes[N]; + Store(offset, Rebind(), offset_lanes); + + uint8_t* base_bytes = reinterpret_cast(base); + for (size_t i = 0; i < N; ++i) { + CopyBytes(&lanes[i], base_bytes + offset_lanes[i]); + } +} + +template +HWY_API void ScatterIndex(Vec128 v, Simd d, T* HWY_RESTRICT base, + const Vec128 index) { + static_assert(sizeof(T) == sizeof(Index), "Must match for portability"); + + alignas(16) T lanes[N]; + Store(v, d, lanes); + + alignas(16) Index index_lanes[N]; + Store(index, Rebind(), index_lanes); + + for (size_t i = 0; i < N; ++i) { + base[index_lanes[i]] = lanes[i]; + } +} + +// ------------------------------ Gather (Load/Store) + +template +HWY_API Vec128 GatherOffset(const Simd d, + const T* HWY_RESTRICT base, + const Vec128 offset) { + static_assert(sizeof(T) == sizeof(Offset), "Must match for portability"); + + alignas(16) Offset offset_lanes[N]; + Store(offset, Rebind(), offset_lanes); + + alignas(16) T lanes[N]; + const uint8_t* base_bytes = reinterpret_cast(base); + for (size_t i = 0; i < N; ++i) { + CopyBytes(base_bytes + offset_lanes[i], &lanes[i]); + } + return Load(d, lanes); +} + +template +HWY_API Vec128 GatherIndex(const Simd d, + const T* HWY_RESTRICT base, + const Vec128 index) { + static_assert(sizeof(T) == sizeof(Index), "Must match for portability"); + + alignas(16) Index index_lanes[N]; + Store(index, Rebind(), index_lanes); + + alignas(16) T lanes[N]; + for (size_t i = 0; i < N; ++i) { + lanes[i] = base[index_lanes[i]]; + } + return Load(d, lanes); +} + +// ================================================== SWIZZLE + +// ------------------------------ ExtractLane + +namespace detail { + +template +HWY_INLINE T ExtractLane(const Vec128 v) { + return static_cast(wasm_i8x16_extract_lane(v.raw, kLane)); +} +template +HWY_INLINE T ExtractLane(const Vec128 v) { + return static_cast(wasm_i16x8_extract_lane(v.raw, kLane)); +} +template +HWY_INLINE T ExtractLane(const Vec128 v) { + return static_cast(wasm_i32x4_extract_lane(v.raw, kLane)); +} +template +HWY_INLINE T ExtractLane(const Vec128 v) { + return static_cast(wasm_i64x2_extract_lane(v.raw, kLane)); +} + +template +HWY_INLINE float ExtractLane(const Vec128 v) { + return wasm_f32x4_extract_lane(v.raw, kLane); +} + +} // namespace detail + +// One overload per vector length just in case *_extract_lane raise compile +// errors if their argument is out of bounds (even if that would never be +// reached at runtime). 
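+
+// Example (sketch, not part of the library): with a compile-time-constant
+// index, the overloads below reduce to a single extract_lane; a variable
+// index takes the store-to-stack fallback. SumFirstTwo is illustrative only.
+template <typename T, size_t N>
+T SumFirstTwo(const Vec128<T, N> v) {
+  static_assert(N >= 2, "requires at least two lanes");
+  return static_cast<T>(ExtractLane(v, 0) + ExtractLane(v, 1));
+}
+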
+template +HWY_API T ExtractLane(const Vec128 v, size_t i) { + HWY_DASSERT(i == 0); + (void)i; + return GetLane(v); +} + +template +HWY_API T ExtractLane(const Vec128 v, size_t i) { +#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang + if (__builtin_constant_p(i)) { + switch (i) { + case 0: + return detail::ExtractLane<0>(v); + case 1: + return detail::ExtractLane<1>(v); + } + } +#endif + alignas(16) T lanes[2]; + Store(v, DFromV(), lanes); + return lanes[i]; +} + +template +HWY_API T ExtractLane(const Vec128 v, size_t i) { +#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang + if (__builtin_constant_p(i)) { + switch (i) { + case 0: + return detail::ExtractLane<0>(v); + case 1: + return detail::ExtractLane<1>(v); + case 2: + return detail::ExtractLane<2>(v); + case 3: + return detail::ExtractLane<3>(v); + } + } +#endif + alignas(16) T lanes[4]; + Store(v, DFromV(), lanes); + return lanes[i]; +} + +template +HWY_API T ExtractLane(const Vec128 v, size_t i) { +#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang + if (__builtin_constant_p(i)) { + switch (i) { + case 0: + return detail::ExtractLane<0>(v); + case 1: + return detail::ExtractLane<1>(v); + case 2: + return detail::ExtractLane<2>(v); + case 3: + return detail::ExtractLane<3>(v); + case 4: + return detail::ExtractLane<4>(v); + case 5: + return detail::ExtractLane<5>(v); + case 6: + return detail::ExtractLane<6>(v); + case 7: + return detail::ExtractLane<7>(v); + } + } +#endif + alignas(16) T lanes[8]; + Store(v, DFromV(), lanes); + return lanes[i]; +} + +template +HWY_API T ExtractLane(const Vec128 v, size_t i) { +#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang + if (__builtin_constant_p(i)) { + switch (i) { + case 0: + return detail::ExtractLane<0>(v); + case 1: + return detail::ExtractLane<1>(v); + case 2: + return detail::ExtractLane<2>(v); + case 3: + return detail::ExtractLane<3>(v); + case 4: + return detail::ExtractLane<4>(v); + case 5: + return detail::ExtractLane<5>(v); + case 6: + return detail::ExtractLane<6>(v); + case 7: + return detail::ExtractLane<7>(v); + case 8: + return detail::ExtractLane<8>(v); + case 9: + return detail::ExtractLane<9>(v); + case 10: + return detail::ExtractLane<10>(v); + case 11: + return detail::ExtractLane<11>(v); + case 12: + return detail::ExtractLane<12>(v); + case 13: + return detail::ExtractLane<13>(v); + case 14: + return detail::ExtractLane<14>(v); + case 15: + return detail::ExtractLane<15>(v); + } + } +#endif + alignas(16) T lanes[16]; + Store(v, DFromV(), lanes); + return lanes[i]; +} + +// ------------------------------ GetLane +template +HWY_API T GetLane(const Vec128 v) { + return detail::ExtractLane<0>(v); +} + +// ------------------------------ InsertLane + +namespace detail { + +template +HWY_INLINE Vec128 InsertLane(const Vec128 v, T t) { + static_assert(kLane < N, "Lane index out of bounds"); + return Vec128{ + wasm_i8x16_replace_lane(v.raw, kLane, static_cast(t))}; +} + +template +HWY_INLINE Vec128 InsertLane(const Vec128 v, T t) { + static_assert(kLane < N, "Lane index out of bounds"); + return Vec128{ + wasm_i16x8_replace_lane(v.raw, kLane, static_cast(t))}; +} + +template +HWY_INLINE Vec128 InsertLane(const Vec128 v, T t) { + static_assert(kLane < N, "Lane index out of bounds"); + return Vec128{ + wasm_i32x4_replace_lane(v.raw, kLane, static_cast(t))}; +} + +template +HWY_INLINE Vec128 InsertLane(const Vec128 v, T t) { + static_assert(kLane < N, "Lane index out of bounds"); + return Vec128{ + wasm_i64x2_replace_lane(v.raw, 
kLane, static_cast(t))}; +} + +template +HWY_INLINE Vec128 InsertLane(const Vec128 v, float t) { + static_assert(kLane < N, "Lane index out of bounds"); + return Vec128{wasm_f32x4_replace_lane(v.raw, kLane, t)}; +} + +template +HWY_INLINE Vec128 InsertLane(const Vec128 v, double t) { + static_assert(kLane < 2, "Lane index out of bounds"); + return Vec128{wasm_f64x2_replace_lane(v.raw, kLane, t)}; +} + +} // namespace detail + +// Requires one overload per vector length because InsertLane<3> may be a +// compile error if it calls wasm_f64x2_replace_lane. + +template +HWY_API Vec128 InsertLane(const Vec128 v, size_t i, T t) { + HWY_DASSERT(i == 0); + (void)i; + return Set(DFromV(), t); +} + +template +HWY_API Vec128 InsertLane(const Vec128 v, size_t i, T t) { +#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang + if (__builtin_constant_p(i)) { + switch (i) { + case 0: + return detail::InsertLane<0>(v, t); + case 1: + return detail::InsertLane<1>(v, t); + } + } +#endif + const DFromV d; + alignas(16) T lanes[2]; + Store(v, d, lanes); + lanes[i] = t; + return Load(d, lanes); +} + +template +HWY_API Vec128 InsertLane(const Vec128 v, size_t i, T t) { +#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang + if (__builtin_constant_p(i)) { + switch (i) { + case 0: + return detail::InsertLane<0>(v, t); + case 1: + return detail::InsertLane<1>(v, t); + case 2: + return detail::InsertLane<2>(v, t); + case 3: + return detail::InsertLane<3>(v, t); + } + } +#endif + const DFromV d; + alignas(16) T lanes[4]; + Store(v, d, lanes); + lanes[i] = t; + return Load(d, lanes); +} + +template +HWY_API Vec128 InsertLane(const Vec128 v, size_t i, T t) { +#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang + if (__builtin_constant_p(i)) { + switch (i) { + case 0: + return detail::InsertLane<0>(v, t); + case 1: + return detail::InsertLane<1>(v, t); + case 2: + return detail::InsertLane<2>(v, t); + case 3: + return detail::InsertLane<3>(v, t); + case 4: + return detail::InsertLane<4>(v, t); + case 5: + return detail::InsertLane<5>(v, t); + case 6: + return detail::InsertLane<6>(v, t); + case 7: + return detail::InsertLane<7>(v, t); + } + } +#endif + const DFromV d; + alignas(16) T lanes[8]; + Store(v, d, lanes); + lanes[i] = t; + return Load(d, lanes); +} + +template +HWY_API Vec128 InsertLane(const Vec128 v, size_t i, T t) { +#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang + if (__builtin_constant_p(i)) { + switch (i) { + case 0: + return detail::InsertLane<0>(v, t); + case 1: + return detail::InsertLane<1>(v, t); + case 2: + return detail::InsertLane<2>(v, t); + case 3: + return detail::InsertLane<3>(v, t); + case 4: + return detail::InsertLane<4>(v, t); + case 5: + return detail::InsertLane<5>(v, t); + case 6: + return detail::InsertLane<6>(v, t); + case 7: + return detail::InsertLane<7>(v, t); + case 8: + return detail::InsertLane<8>(v, t); + case 9: + return detail::InsertLane<9>(v, t); + case 10: + return detail::InsertLane<10>(v, t); + case 11: + return detail::InsertLane<11>(v, t); + case 12: + return detail::InsertLane<12>(v, t); + case 13: + return detail::InsertLane<13>(v, t); + case 14: + return detail::InsertLane<14>(v, t); + case 15: + return detail::InsertLane<15>(v, t); + } + } +#endif + const DFromV d; + alignas(16) T lanes[16]; + Store(v, d, lanes); + lanes[i] = t; + return Load(d, lanes); +} + +// ------------------------------ LowerHalf + +template +HWY_API Vec128 LowerHalf(Simd /* tag */, + Vec128 v) { + return Vec128{v.raw}; +} + +template +HWY_API Vec128 
LowerHalf(Vec128 v) { + return LowerHalf(Simd(), v); +} + +// ------------------------------ ShiftLeftBytes + +// 0x01..0F, kBytes = 1 => 0x02..0F00 +template +HWY_API Vec128 ShiftLeftBytes(Simd /* tag */, Vec128 v) { + static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes"); + const __i8x16 zero = wasm_i8x16_splat(0); + switch (kBytes) { + case 0: + return v; + + case 1: + return Vec128{wasm_i8x16_shuffle(v.raw, zero, 16, 0, 1, 2, 3, 4, 5, + 6, 7, 8, 9, 10, 11, 12, 13, 14)}; + + case 2: + return Vec128{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 0, 1, 2, 3, 4, + 5, 6, 7, 8, 9, 10, 11, 12, 13)}; + + case 3: + return Vec128{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 0, 1, 2, + 3, 4, 5, 6, 7, 8, 9, 10, 11, 12)}; + + case 4: + return Vec128{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 0, 1, + 2, 3, 4, 5, 6, 7, 8, 9, 10, 11)}; + + case 5: + return Vec128{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 0, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10)}; + + case 6: + return Vec128{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, + 16, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9)}; + + case 7: + return Vec128{wasm_i8x16_shuffle( + v.raw, zero, 16, 16, 16, 16, 16, 16, 16, 0, 1, 2, 3, 4, 5, 6, 7, 8)}; + + case 8: + return Vec128{wasm_i8x16_shuffle( + v.raw, zero, 16, 16, 16, 16, 16, 16, 16, 16, 0, 1, 2, 3, 4, 5, 6, 7)}; + + case 9: + return Vec128{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 0, 1, 2, 3, 4, 5, + 6)}; + + case 10: + return Vec128{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 0, 1, 2, 3, 4, + 5)}; + + case 11: + return Vec128{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 0, 1, 2, 3, + 4)}; + + case 12: + return Vec128{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 0, 1, + 2, 3)}; + + case 13: + return Vec128{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 0, + 1, 2)}; + + case 14: + return Vec128{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, + 0, 1)}; + + case 15: + return Vec128{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 0)}; + } + return Vec128{zero}; +} + +template +HWY_API Vec128 ShiftLeftBytes(Vec128 v) { + return ShiftLeftBytes(Simd(), v); +} + +// ------------------------------ ShiftLeftLanes + +template +HWY_API Vec128 ShiftLeftLanes(Simd d, const Vec128 v) { + const Repartition d8; + return BitCast(d, ShiftLeftBytes(BitCast(d8, v))); +} + +template +HWY_API Vec128 ShiftLeftLanes(const Vec128 v) { + return ShiftLeftLanes(DFromV(), v); +} + +// ------------------------------ ShiftRightBytes +namespace detail { + +// Helper function allows zeroing invalid lanes in caller. 
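+
+// Note: the "0x01..0F, kBytes = 1 => 0x02..0F00" notation above reads the 16
+// bytes as one little-endian 128-bit integer, so shifting "left" by one byte
+// multiplies by 256: each lane receives its lower-indexed neighbor and a
+// zero byte enters at the low end.
+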
+template +HWY_API __i8x16 ShrBytes(const Vec128 v) { + static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes"); + const __i8x16 zero = wasm_i8x16_splat(0); + + switch (kBytes) { + case 0: + return v.raw; + + case 1: + return wasm_i8x16_shuffle(v.raw, zero, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, + 12, 13, 14, 15, 16); + + case 2: + return wasm_i8x16_shuffle(v.raw, zero, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, + 13, 14, 15, 16, 16); + + case 3: + return wasm_i8x16_shuffle(v.raw, zero, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, + 13, 14, 15, 16, 16, 16); + + case 4: + return wasm_i8x16_shuffle(v.raw, zero, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, + 14, 15, 16, 16, 16, 16); + + case 5: + return wasm_i8x16_shuffle(v.raw, zero, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 15, 16, 16, 16, 16, 16); + + case 6: + return wasm_i8x16_shuffle(v.raw, zero, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 16, 16, 16, 16, 16); + + case 7: + return wasm_i8x16_shuffle(v.raw, zero, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 16, 16, 16, 16, 16, 16); + + case 8: + return wasm_i8x16_shuffle(v.raw, zero, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 16, 16, 16, 16, 16, 16, 16); + + case 9: + return wasm_i8x16_shuffle(v.raw, zero, 9, 10, 11, 12, 13, 14, 15, 16, 16, + 16, 16, 16, 16, 16, 16, 16); + + case 10: + return wasm_i8x16_shuffle(v.raw, zero, 10, 11, 12, 13, 14, 15, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16); + + case 11: + return wasm_i8x16_shuffle(v.raw, zero, 11, 12, 13, 14, 15, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16); + + case 12: + return wasm_i8x16_shuffle(v.raw, zero, 12, 13, 14, 15, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16); + + case 13: + return wasm_i8x16_shuffle(v.raw, zero, 13, 14, 15, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16); + + case 14: + return wasm_i8x16_shuffle(v.raw, zero, 14, 15, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16); + + case 15: + return wasm_i8x16_shuffle(v.raw, zero, 15, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16); + case 16: + return zero; + } +} + +} // namespace detail + +// 0x01..0F, kBytes = 1 => 0x0001..0E +template +HWY_API Vec128 ShiftRightBytes(Simd /* tag */, Vec128 v) { + // For partial vectors, clear upper lanes so we shift in zeros. + if (N != 16 / sizeof(T)) { + const Vec128 vfull{v.raw}; + v = Vec128{IfThenElseZero(FirstN(Full128(), N), vfull).raw}; + } + return Vec128{detail::ShrBytes(v)}; +} + +// ------------------------------ ShiftRightLanes +template +HWY_API Vec128 ShiftRightLanes(Simd d, const Vec128 v) { + const Repartition d8; + return BitCast(d, ShiftRightBytes(d8, BitCast(d8, v))); +} + +// ------------------------------ UpperHalf (ShiftRightBytes) + +// Full input: copy hi into lo (smaller instruction encoding than shifts). 
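+
+// Example (sketch, not part of the library): splitting a full vector into
+// halves and recombining. AddHalves is illustrative only and assumes an
+// arithmetic lane type.
+template <typename T>
+Vec64<T> AddHalves(const Vec128<T> v) {
+  const Full64<T> dh;
+  return Add(LowerHalf(dh, v), UpperHalf(dh, v));
+}
+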
+template +HWY_API Vec64 UpperHalf(Full64 /* tag */, const Vec128 v) { + return Vec64{wasm_i32x4_shuffle(v.raw, v.raw, 2, 3, 2, 3)}; +} +HWY_API Vec64 UpperHalf(Full64 /* tag */, const Vec128 v) { + return Vec64{wasm_i32x4_shuffle(v.raw, v.raw, 2, 3, 2, 3)}; +} + +// Partial +template +HWY_API Vec128 UpperHalf(Half> /* tag */, + Vec128 v) { + const DFromV d; + const RebindToUnsigned du; + const auto vu = BitCast(du, v); + const auto upper = BitCast(d, ShiftRightBytes(du, vu)); + return Vec128{upper.raw}; +} + +// ------------------------------ CombineShiftRightBytes + +template > +HWY_API V CombineShiftRightBytes(Full128 /* tag */, V hi, V lo) { + static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes"); + switch (kBytes) { + case 0: + return lo; + + case 1: + return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, + 11, 12, 13, 14, 15, 16)}; + + case 2: + return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 2, 3, 4, 5, 6, 7, 8, 9, 10, + 11, 12, 13, 14, 15, 16, 17)}; + + case 3: + return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 3, 4, 5, 6, 7, 8, 9, 10, 11, + 12, 13, 14, 15, 16, 17, 18)}; + + case 4: + return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 4, 5, 6, 7, 8, 9, 10, 11, 12, + 13, 14, 15, 16, 17, 18, 19)}; + + case 5: + return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 5, 6, 7, 8, 9, 10, 11, 12, 13, + 14, 15, 16, 17, 18, 19, 20)}; + + case 6: + return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 6, 7, 8, 9, 10, 11, 12, 13, + 14, 15, 16, 17, 18, 19, 20, 21)}; + + case 7: + return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 7, 8, 9, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21, 22)}; + + case 8: + return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23)}; + + case 9: + return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24)}; + + case 10: + return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25)}; + + case 11: + return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 11, 12, 13, 14, 15, 16, 17, + 18, 19, 20, 21, 22, 23, 24, 25, 26)}; + + case 12: + return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 12, 13, 14, 15, 16, 17, 18, + 19, 20, 21, 22, 23, 24, 25, 26, 27)}; + + case 13: + return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 13, 14, 15, 16, 17, 18, 19, + 20, 21, 22, 23, 24, 25, 26, 27, 28)}; + + case 14: + return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 14, 15, 16, 17, 18, 19, 20, + 21, 22, 23, 24, 25, 26, 27, 28, 29)}; + + case 15: + return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 15, 16, 17, 18, 19, 20, 21, + 22, 23, 24, 25, 26, 27, 28, 29, 30)}; + } + return hi; +} + +template > +HWY_API V CombineShiftRightBytes(Simd d, V hi, V lo) { + constexpr size_t kSize = N * sizeof(T); + static_assert(0 < kBytes && kBytes < kSize, "kBytes invalid"); + const Repartition d8; + const Full128 d_full8; + using V8 = VFromD; + const V8 hi8{BitCast(d8, hi).raw}; + // Move into most-significant bytes + const V8 lo8 = ShiftLeftBytes<16 - kSize>(V8{BitCast(d8, lo).raw}); + const V8 r = CombineShiftRightBytes<16 - kSize + kBytes>(d_full8, hi8, lo8); + return V{BitCast(Full128(), r).raw}; +} + +// ------------------------------ Broadcast/splat any lane + +template +HWY_API Vec128 Broadcast(const Vec128 v) { + static_assert(0 <= kLane && kLane < N, "Invalid lane"); + return Vec128{wasm_i16x8_shuffle(v.raw, v.raw, kLane, kLane, kLane, + kLane, kLane, kLane, kLane, kLane)}; +} + +template +HWY_API Vec128 Broadcast(const Vec128 v) { + static_assert(0 <= kLane && kLane < N, "Invalid lane"); + 
return Vec128{ + wasm_i32x4_shuffle(v.raw, v.raw, kLane, kLane, kLane, kLane)}; +} + +template +HWY_API Vec128 Broadcast(const Vec128 v) { + static_assert(0 <= kLane && kLane < N, "Invalid lane"); + return Vec128{wasm_i64x2_shuffle(v.raw, v.raw, kLane, kLane)}; +} + +// ------------------------------ TableLookupBytes + +// Returns vector of bytes[from[i]]. "from" is also interpreted as bytes, i.e. +// lane indices in [0, 16). +template +HWY_API Vec128 TableLookupBytes(const Vec128 bytes, + const Vec128 from) { +// Not yet available in all engines, see +// https://github.com/WebAssembly/simd/blob/bdcc304b2d379f4601c2c44ea9b44ed9484fde7e/proposals/simd/ImplementationStatus.md +// V8 implementation of this had a bug, fixed on 2021-04-03: +// https://chromium-review.googlesource.com/c/v8/v8/+/2822951 +#if 0 + return Vec128{wasm_i8x16_swizzle(bytes.raw, from.raw)}; +#else + alignas(16) uint8_t control[16]; + alignas(16) uint8_t input[16]; + alignas(16) uint8_t output[16]; + wasm_v128_store(control, from.raw); + wasm_v128_store(input, bytes.raw); + for (size_t i = 0; i < 16; ++i) { + output[i] = control[i] < 16 ? input[control[i]] : 0; + } + return Vec128{wasm_v128_load(output)}; +#endif +} + +template +HWY_API Vec128 TableLookupBytesOr0(const Vec128 bytes, + const Vec128 from) { + const Simd d; + // Mask size must match vector type, so cast everything to this type. + Repartition di8; + Repartition> d_bytes8; + const auto msb = BitCast(di8, from) < Zero(di8); + const auto lookup = + TableLookupBytes(BitCast(d_bytes8, bytes), BitCast(di8, from)); + return BitCast(d, IfThenZeroElse(msb, lookup)); +} + +// ------------------------------ Hard-coded shuffles + +// Notation: let Vec128 have lanes 3,2,1,0 (0 is least-significant). +// Shuffle0321 rotates one lane to the right (the previous least-significant +// lane is now most-significant). These could also be implemented via +// CombineShiftRightBytes but the shuffle_abcd notation is more convenient. + +// Swap 32-bit halves in 64-bit halves. +template +HWY_API Vec128 Shuffle2301(const Vec128 v) { + static_assert(sizeof(T) == 4, "Only for 32-bit lanes"); + static_assert(N == 2 || N == 4, "Does not make sense for N=1"); + return Vec128{wasm_i32x4_shuffle(v.raw, v.raw, 1, 0, 3, 2)}; +} + +// These are used by generic_ops-inl to implement LoadInterleaved3. 
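+
+// Worked example of the notation: for lanes {0,1,2,3} (lane 0 is
+// least-significant), Shuffle2301 yields {1,0,3,2}, Shuffle0321 yields
+// {1,2,3,0} and Shuffle0123 yields {3,2,1,0}.
+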
+namespace detail { + +template +HWY_API Vec128 Shuffle2301(const Vec128 a, const Vec128 b) { + static_assert(N == 2 || N == 4, "Does not make sense for N=1"); + return Vec128{wasm_i8x16_shuffle(a.raw, b.raw, 1, 0, 3 + 16, 2 + 16, + 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, + 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F)}; +} +template +HWY_API Vec128 Shuffle2301(const Vec128 a, const Vec128 b) { + static_assert(N == 2 || N == 4, "Does not make sense for N=1"); + return Vec128{wasm_i16x8_shuffle(a.raw, b.raw, 1, 0, 3 + 8, 2 + 8, + 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF)}; +} +template +HWY_API Vec128 Shuffle2301(const Vec128 a, const Vec128 b) { + static_assert(N == 2 || N == 4, "Does not make sense for N=1"); + return Vec128{wasm_i32x4_shuffle(a.raw, b.raw, 1, 0, 3 + 4, 2 + 4)}; +} + +template +HWY_API Vec128 Shuffle1230(const Vec128 a, const Vec128 b) { + static_assert(N == 2 || N == 4, "Does not make sense for N=1"); + return Vec128{wasm_i8x16_shuffle(a.raw, b.raw, 0, 3, 2 + 16, 1 + 16, + 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, + 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F)}; +} +template +HWY_API Vec128 Shuffle1230(const Vec128 a, const Vec128 b) { + static_assert(N == 2 || N == 4, "Does not make sense for N=1"); + return Vec128{wasm_i16x8_shuffle(a.raw, b.raw, 0, 3, 2 + 8, 1 + 8, + 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF)}; +} +template +HWY_API Vec128 Shuffle1230(const Vec128 a, const Vec128 b) { + static_assert(N == 2 || N == 4, "Does not make sense for N=1"); + return Vec128{wasm_i32x4_shuffle(a.raw, b.raw, 0, 3, 2 + 4, 1 + 4)}; +} + +template +HWY_API Vec128 Shuffle3012(const Vec128 a, const Vec128 b) { + static_assert(N == 2 || N == 4, "Does not make sense for N=1"); + return Vec128{wasm_i8x16_shuffle(a.raw, b.raw, 2, 1, 0 + 16, 3 + 16, + 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, + 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F)}; +} +template +HWY_API Vec128 Shuffle3012(const Vec128 a, const Vec128 b) { + static_assert(N == 2 || N == 4, "Does not make sense for N=1"); + return Vec128{wasm_i16x8_shuffle(a.raw, b.raw, 2, 1, 0 + 8, 3 + 8, + 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF)}; +} +template +HWY_API Vec128 Shuffle3012(const Vec128 a, const Vec128 b) { + static_assert(N == 2 || N == 4, "Does not make sense for N=1"); + return Vec128{wasm_i32x4_shuffle(a.raw, b.raw, 2, 1, 0 + 4, 3 + 4)}; +} + +} // namespace detail + +// Swap 64-bit halves +template +HWY_API Vec128 Shuffle01(const Vec128 v) { + static_assert(sizeof(T) == 8, "Only for 64-bit lanes"); + return Vec128{wasm_i64x2_shuffle(v.raw, v.raw, 1, 0)}; +} +template +HWY_API Vec128 Shuffle1032(const Vec128 v) { + static_assert(sizeof(T) == 4, "Only for 32-bit lanes"); + return Vec128{wasm_i64x2_shuffle(v.raw, v.raw, 1, 0)}; +} + +// Rotate right 32 bits +template +HWY_API Vec128 Shuffle0321(const Vec128 v) { + static_assert(sizeof(T) == 4, "Only for 32-bit lanes"); + return Vec128{wasm_i32x4_shuffle(v.raw, v.raw, 1, 2, 3, 0)}; +} + +// Rotate left 32 bits +template +HWY_API Vec128 Shuffle2103(const Vec128 v) { + static_assert(sizeof(T) == 4, "Only for 32-bit lanes"); + return Vec128{wasm_i32x4_shuffle(v.raw, v.raw, 3, 0, 1, 2)}; +} + +// Reverse +template +HWY_API Vec128 Shuffle0123(const Vec128 v) { + static_assert(sizeof(T) == 4, "Only for 32-bit lanes"); + return Vec128{wasm_i32x4_shuffle(v.raw, v.raw, 3, 2, 1, 0)}; +} + +// ------------------------------ TableLookupLanes + +// Returned by SetTableIndices for use by TableLookupLanes. 
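+
+// Example (sketch, not part of the library): reversing four 32-bit lanes via
+// a lookup table; indices must be in [0, N). ReverseLanes4 is illustrative
+// only.
+template <class D, typename T = TFromD<D>>
+Vec128<T> ReverseLanes4(D d, const Vec128<T> v) {
+  static_assert(sizeof(T) == 4, "sketch assumes four 32-bit lanes");
+  alignas(16) static constexpr int32_t kRev[4] = {3, 2, 1, 0};
+  return TableLookupLanes(v, SetTableIndices(d, kRev));
+}
+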
+template +struct Indices128 { + __v128_u raw; +}; + +template +HWY_API Indices128 IndicesFromVec(Simd d, Vec128 vec) { + static_assert(sizeof(T) == sizeof(TI), "Index size must match lane"); +#if HWY_IS_DEBUG_BUILD + const Rebind di; + HWY_DASSERT(AllFalse(di, Lt(vec, Zero(di))) && + AllTrue(di, Lt(vec, Set(di, static_cast(N))))); +#endif + + const Repartition d8; + using V8 = VFromD; + const Repartition d16; + + // Broadcast each lane index to all bytes of T and shift to bytes + static_assert(sizeof(T) == 4 || sizeof(T) == 8, ""); + if (sizeof(T) == 4) { + alignas(16) constexpr uint8_t kBroadcastLaneBytes[16] = { + 0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12}; + const V8 lane_indices = + TableLookupBytes(BitCast(d8, vec), Load(d8, kBroadcastLaneBytes)); + const V8 byte_indices = + BitCast(d8, ShiftLeft<2>(BitCast(d16, lane_indices))); + alignas(16) constexpr uint8_t kByteOffsets[16] = {0, 1, 2, 3, 0, 1, 2, 3, + 0, 1, 2, 3, 0, 1, 2, 3}; + return Indices128{Add(byte_indices, Load(d8, kByteOffsets)).raw}; + } else { + alignas(16) constexpr uint8_t kBroadcastLaneBytes[16] = { + 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8}; + const V8 lane_indices = + TableLookupBytes(BitCast(d8, vec), Load(d8, kBroadcastLaneBytes)); + const V8 byte_indices = + BitCast(d8, ShiftLeft<3>(BitCast(d16, lane_indices))); + alignas(16) constexpr uint8_t kByteOffsets[16] = {0, 1, 2, 3, 4, 5, 6, 7, + 0, 1, 2, 3, 4, 5, 6, 7}; + return Indices128{Add(byte_indices, Load(d8, kByteOffsets)).raw}; + } +} + +template +HWY_API Indices128 SetTableIndices(Simd d, const TI* idx) { + const Rebind di; + return IndicesFromVec(d, LoadU(di, idx)); +} + +template +HWY_API Vec128 TableLookupLanes(Vec128 v, Indices128 idx) { + using TI = MakeSigned; + const DFromV d; + const Rebind di; + return BitCast(d, TableLookupBytes(BitCast(di, v), Vec128{idx.raw})); +} + +// ------------------------------ Reverse (Shuffle0123, Shuffle2301, Shuffle01) + +// Single lane: no change +template +HWY_API Vec128 Reverse(Simd /* tag */, const Vec128 v) { + return v; +} + +// Two lanes: shuffle +template +HWY_API Vec128 Reverse(Simd /* tag */, const Vec128 v) { + return Vec128{Shuffle2301(Vec128{v.raw}).raw}; +} + +template +HWY_API Vec128 Reverse(Full128 /* tag */, const Vec128 v) { + return Shuffle01(v); +} + +// Four lanes: shuffle +template +HWY_API Vec128 Reverse(Full128 /* tag */, const Vec128 v) { + return Shuffle0123(v); +} + +// 16-bit +template +HWY_API Vec128 Reverse(Simd d, const Vec128 v) { + const RepartitionToWide> du32; + return BitCast(d, RotateRight<16>(Reverse(du32, BitCast(du32, v)))); +} + +// ------------------------------ Reverse2 + +template +HWY_API Vec128 Reverse2(Simd d, const Vec128 v) { + const RepartitionToWide> du32; + return BitCast(d, RotateRight<16>(BitCast(du32, v))); +} + +template +HWY_API Vec128 Reverse2(Simd /* tag */, const Vec128 v) { + return Shuffle2301(v); +} + +template +HWY_API Vec128 Reverse2(Simd /* tag */, const Vec128 v) { + return Shuffle01(v); +} + +// ------------------------------ Reverse4 + +template +HWY_API Vec128 Reverse4(Simd d, const Vec128 v) { + return BitCast(d, Vec128{wasm_i16x8_shuffle(v.raw, v.raw, 3, 2, + 1, 0, 7, 6, 5, 4)}); +} + +template +HWY_API Vec128 Reverse4(Simd /* tag */, const Vec128 v) { + return Shuffle0123(v); +} + +template +HWY_API Vec128 Reverse4(Simd /* tag */, const Vec128) { + HWY_ASSERT(0); // don't have 8 u64 lanes +} + +// ------------------------------ Reverse8 + +template +HWY_API Vec128 Reverse8(Simd d, const Vec128 v) { + return Reverse(d, v); +} + 
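+// E.g. for u16 lanes {0,1,2,3,4,5,6,7}, Reverse4 returns {3,2,1,0,7,6,5,4}
+// (reversal within each group of four), whereas Reverse8 returns the fully
+// reversed {7,6,5,4,3,2,1,0}.
+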
+template +HWY_API Vec128 Reverse8(Simd, const Vec128) { + HWY_ASSERT(0); // don't have 8 lanes unless 16-bit +} + +// ------------------------------ InterleaveLower + +template +HWY_API Vec128 InterleaveLower(Vec128 a, + Vec128 b) { + return Vec128{wasm_i8x16_shuffle( + a.raw, b.raw, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23)}; +} +template +HWY_API Vec128 InterleaveLower(Vec128 a, + Vec128 b) { + return Vec128{ + wasm_i16x8_shuffle(a.raw, b.raw, 0, 8, 1, 9, 2, 10, 3, 11)}; +} +template +HWY_API Vec128 InterleaveLower(Vec128 a, + Vec128 b) { + return Vec128{wasm_i32x4_shuffle(a.raw, b.raw, 0, 4, 1, 5)}; +} +template +HWY_API Vec128 InterleaveLower(Vec128 a, + Vec128 b) { + return Vec128{wasm_i64x2_shuffle(a.raw, b.raw, 0, 2)}; +} + +template +HWY_API Vec128 InterleaveLower(Vec128 a, + Vec128 b) { + return Vec128{wasm_i8x16_shuffle( + a.raw, b.raw, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23)}; +} +template +HWY_API Vec128 InterleaveLower(Vec128 a, + Vec128 b) { + return Vec128{ + wasm_i16x8_shuffle(a.raw, b.raw, 0, 8, 1, 9, 2, 10, 3, 11)}; +} +template +HWY_API Vec128 InterleaveLower(Vec128 a, + Vec128 b) { + return Vec128{wasm_i32x4_shuffle(a.raw, b.raw, 0, 4, 1, 5)}; +} +template +HWY_API Vec128 InterleaveLower(Vec128 a, + Vec128 b) { + return Vec128{wasm_i64x2_shuffle(a.raw, b.raw, 0, 2)}; +} + +template +HWY_API Vec128 InterleaveLower(Vec128 a, + Vec128 b) { + return Vec128{wasm_i32x4_shuffle(a.raw, b.raw, 0, 4, 1, 5)}; +} + +template +HWY_API Vec128 InterleaveLower(Vec128 a, + Vec128 b) { + return Vec128{wasm_i64x2_shuffle(a.raw, b.raw, 0, 2)}; +} + +// Additional overload for the optional tag. +template +HWY_API V InterleaveLower(DFromV /* tag */, V a, V b) { + return InterleaveLower(a, b); +} + +// ------------------------------ InterleaveUpper (UpperHalf) + +// All functions inside detail lack the required D parameter. 
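+
+// E.g. for u32 lanes a={0,1,2,3} and b={4,5,6,7}: InterleaveLower(a, b)
+// yields {0,4,1,5} and InterleaveUpper(a, b) yields {2,6,3,7}.
+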
+namespace detail { + +template +HWY_API Vec128 InterleaveUpper(Vec128 a, + Vec128 b) { + return Vec128{wasm_i8x16_shuffle(a.raw, b.raw, 8, 24, 9, 25, 10, + 26, 11, 27, 12, 28, 13, 29, 14, + 30, 15, 31)}; +} +template +HWY_API Vec128 InterleaveUpper(Vec128 a, + Vec128 b) { + return Vec128{ + wasm_i16x8_shuffle(a.raw, b.raw, 4, 12, 5, 13, 6, 14, 7, 15)}; +} +template +HWY_API Vec128 InterleaveUpper(Vec128 a, + Vec128 b) { + return Vec128{wasm_i32x4_shuffle(a.raw, b.raw, 2, 6, 3, 7)}; +} +template +HWY_API Vec128 InterleaveUpper(Vec128 a, + Vec128 b) { + return Vec128{wasm_i64x2_shuffle(a.raw, b.raw, 1, 3)}; +} + +template +HWY_API Vec128 InterleaveUpper(Vec128 a, + Vec128 b) { + return Vec128{wasm_i8x16_shuffle(a.raw, b.raw, 8, 24, 9, 25, 10, + 26, 11, 27, 12, 28, 13, 29, 14, + 30, 15, 31)}; +} +template +HWY_API Vec128 InterleaveUpper(Vec128 a, + Vec128 b) { + return Vec128{ + wasm_i16x8_shuffle(a.raw, b.raw, 4, 12, 5, 13, 6, 14, 7, 15)}; +} +template +HWY_API Vec128 InterleaveUpper(Vec128 a, + Vec128 b) { + return Vec128{wasm_i32x4_shuffle(a.raw, b.raw, 2, 6, 3, 7)}; +} +template +HWY_API Vec128 InterleaveUpper(Vec128 a, + Vec128 b) { + return Vec128{wasm_i64x2_shuffle(a.raw, b.raw, 1, 3)}; +} + +template +HWY_API Vec128 InterleaveUpper(Vec128 a, + Vec128 b) { + return Vec128{wasm_i32x4_shuffle(a.raw, b.raw, 2, 6, 3, 7)}; +} + +template +HWY_API Vec128 InterleaveUpper(Vec128 a, + Vec128 b) { + return Vec128{wasm_i64x2_shuffle(a.raw, b.raw, 1, 3)}; +} + +} // namespace detail + +// Full +template > +HWY_API V InterleaveUpper(Full128 /* tag */, V a, V b) { + return detail::InterleaveUpper(a, b); +} + +// Partial +template > +HWY_API V InterleaveUpper(Simd d, V a, V b) { + const Half d2; + return InterleaveLower(d, V{UpperHalf(d2, a).raw}, V{UpperHalf(d2, b).raw}); +} + +// ------------------------------ ZipLower/ZipUpper (InterleaveLower) + +// Same as Interleave*, except that the return lanes are double-width integers; +// this is necessary because the single-lane scalar cannot return two values. +template >> +HWY_API VFromD ZipLower(V a, V b) { + return BitCast(DW(), InterleaveLower(a, b)); +} +template , class DW = RepartitionToWide> +HWY_API VFromD ZipLower(DW dw, V a, V b) { + return BitCast(dw, InterleaveLower(D(), a, b)); +} + +template , class DW = RepartitionToWide> +HWY_API VFromD ZipUpper(DW dw, V a, V b) { + return BitCast(dw, InterleaveUpper(D(), a, b)); +} + +// ================================================== COMBINE + +// ------------------------------ Combine (InterleaveLower) + +// N = N/2 + N/2 (upper half undefined) +template +HWY_API Vec128 Combine(Simd d, Vec128 hi_half, + Vec128 lo_half) { + const Half d2; + const RebindToUnsigned du2; + // Treat half-width input as one lane, and expand to two lanes. 
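+  // E.g. for N=4 u32 lanes, each 64-bit half is viewed as a single u64 lane;
+  // InterleaveLower then places lo in lane 0 and hi in lane 1.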
+ using VU = Vec128, 2>; + const VU lo{BitCast(du2, lo_half).raw}; + const VU hi{BitCast(du2, hi_half).raw}; + return BitCast(d, InterleaveLower(lo, hi)); +} + +// ------------------------------ ZeroExtendVector (Combine, IfThenElseZero) + +template +HWY_API Vec128 ZeroExtendVector(Simd d, Vec128 lo) { + return IfThenElseZero(FirstN(d, N / 2), Vec128{lo.raw}); +} + +// ------------------------------ ConcatLowerLower + +// hiH,hiL loH,loL |-> hiL,loL (= lower halves) +template +HWY_API Vec128 ConcatLowerLower(Full128 /* tag */, const Vec128 hi, + const Vec128 lo) { + return Vec128{wasm_i64x2_shuffle(lo.raw, hi.raw, 0, 2)}; +} +template +HWY_API Vec128 ConcatLowerLower(Simd d, const Vec128 hi, + const Vec128 lo) { + const Half d2; + return Combine(d, LowerHalf(d2, hi), LowerHalf(d2, lo)); +} + +// ------------------------------ ConcatUpperUpper + +template +HWY_API Vec128 ConcatUpperUpper(Full128 /* tag */, const Vec128 hi, + const Vec128 lo) { + return Vec128{wasm_i64x2_shuffle(lo.raw, hi.raw, 1, 3)}; +} +template +HWY_API Vec128 ConcatUpperUpper(Simd d, const Vec128 hi, + const Vec128 lo) { + const Half d2; + return Combine(d, UpperHalf(d2, hi), UpperHalf(d2, lo)); +} + +// ------------------------------ ConcatLowerUpper + +template +HWY_API Vec128 ConcatLowerUpper(Full128 d, const Vec128 hi, + const Vec128 lo) { + return CombineShiftRightBytes<8>(d, hi, lo); +} +template +HWY_API Vec128 ConcatLowerUpper(Simd d, const Vec128 hi, + const Vec128 lo) { + const Half d2; + return Combine(d, LowerHalf(d2, hi), UpperHalf(d2, lo)); +} + +// ------------------------------ ConcatUpperLower +template +HWY_API Vec128 ConcatUpperLower(Simd d, const Vec128 hi, + const Vec128 lo) { + return IfThenElse(FirstN(d, Lanes(d) / 2), lo, hi); +} + +// ------------------------------ ConcatOdd + +// 8-bit full +template +HWY_API Vec128 ConcatOdd(Full128 /* tag */, Vec128 hi, Vec128 lo) { + return Vec128{wasm_i8x16_shuffle(lo.raw, hi.raw, 1, 3, 5, 7, 9, 11, 13, 15, + 17, 19, 21, 23, 25, 27, 29, 31)}; +} + +// 8-bit x8 +template +HWY_API Vec128 ConcatOdd(Simd /* tag */, Vec128 hi, + Vec128 lo) { + // Don't care about upper half. + return Vec128{wasm_i8x16_shuffle(lo.raw, hi.raw, 1, 3, 5, 7, 17, 19, 21, + 23, 1, 3, 5, 7, 17, 19, 21, 23)}; +} + +// 8-bit x4 +template +HWY_API Vec128 ConcatOdd(Simd /* tag */, Vec128 hi, + Vec128 lo) { + // Don't care about upper 3/4. + return Vec128{wasm_i8x16_shuffle(lo.raw, hi.raw, 1, 3, 17, 19, 1, 3, 17, + 19, 1, 3, 17, 19, 1, 3, 17, 19)}; +} + +// 16-bit full +template +HWY_API Vec128 ConcatOdd(Full128 /* tag */, Vec128 hi, Vec128 lo) { + return Vec128{ + wasm_i16x8_shuffle(lo.raw, hi.raw, 1, 3, 5, 7, 9, 11, 13, 15)}; +} + +// 16-bit x4 +template +HWY_API Vec128 ConcatOdd(Simd /* tag */, Vec128 hi, + Vec128 lo) { + // Don't care about upper half. 
+ return Vec128{ + wasm_i16x8_shuffle(lo.raw, hi.raw, 1, 3, 9, 11, 1, 3, 9, 11)}; +} + +// 32-bit full +template +HWY_API Vec128 ConcatOdd(Full128 /* tag */, Vec128 hi, Vec128 lo) { + return Vec128{wasm_i32x4_shuffle(lo.raw, hi.raw, 1, 3, 5, 7)}; +} + +// Any T x2 +template +HWY_API Vec128 ConcatOdd(Simd d, Vec128 hi, + Vec128 lo) { + return InterleaveUpper(d, lo, hi); +} + +// ------------------------------ ConcatEven (InterleaveLower) + +// 8-bit full +template +HWY_API Vec128 ConcatEven(Full128 /* tag */, Vec128 hi, Vec128 lo) { + return Vec128{wasm_i8x16_shuffle(lo.raw, hi.raw, 0, 2, 4, 6, 8, 10, 12, 14, + 16, 18, 20, 22, 24, 26, 28, 30)}; +} + +// 8-bit x8 +template +HWY_API Vec128 ConcatEven(Simd /* tag */, Vec128 hi, + Vec128 lo) { + // Don't care about upper half. + return Vec128{wasm_i8x16_shuffle(lo.raw, hi.raw, 0, 2, 4, 6, 16, 18, 20, + 22, 0, 2, 4, 6, 16, 18, 20, 22)}; +} + +// 8-bit x4 +template +HWY_API Vec128 ConcatEven(Simd /* tag */, Vec128 hi, + Vec128 lo) { + // Don't care about upper 3/4. + return Vec128{wasm_i8x16_shuffle(lo.raw, hi.raw, 0, 2, 16, 18, 0, 2, 16, + 18, 0, 2, 16, 18, 0, 2, 16, 18)}; +} + +// 16-bit full +template +HWY_API Vec128 ConcatEven(Full128 /* tag */, Vec128 hi, Vec128 lo) { + return Vec128{ + wasm_i16x8_shuffle(lo.raw, hi.raw, 0, 2, 4, 6, 8, 10, 12, 14)}; +} + +// 16-bit x4 +template +HWY_API Vec128 ConcatEven(Simd /* tag */, Vec128 hi, + Vec128 lo) { + // Don't care about upper half. + return Vec128{ + wasm_i16x8_shuffle(lo.raw, hi.raw, 0, 2, 8, 10, 0, 2, 8, 10)}; +} + +// 32-bit full +template +HWY_API Vec128 ConcatEven(Full128 /* tag */, Vec128 hi, Vec128 lo) { + return Vec128{wasm_i32x4_shuffle(lo.raw, hi.raw, 0, 2, 4, 6)}; +} + +// Any T x2 +template +HWY_API Vec128 ConcatEven(Simd d, Vec128 hi, + Vec128 lo) { + return InterleaveLower(d, lo, hi); +} + +// ------------------------------ DupEven (InterleaveLower) + +template +HWY_API Vec128 DupEven(Vec128 v) { + return Vec128{wasm_i32x4_shuffle(v.raw, v.raw, 0, 0, 2, 2)}; +} + +template +HWY_API Vec128 DupEven(const Vec128 v) { + return InterleaveLower(DFromV(), v, v); +} + +// ------------------------------ DupOdd (InterleaveUpper) + +template +HWY_API Vec128 DupOdd(Vec128 v) { + return Vec128{wasm_i32x4_shuffle(v.raw, v.raw, 1, 1, 3, 3)}; +} + +template +HWY_API Vec128 DupOdd(const Vec128 v) { + return InterleaveUpper(DFromV(), v, v); +} + +// ------------------------------ OddEven + +namespace detail { + +template +HWY_INLINE Vec128 OddEven(hwy::SizeTag<1> /* tag */, const Vec128 a, + const Vec128 b) { + const DFromV d; + const Repartition d8; + alignas(16) constexpr uint8_t mask[16] = {0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, + 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0}; + return IfThenElse(MaskFromVec(BitCast(d, Load(d8, mask))), b, a); +} +template +HWY_INLINE Vec128 OddEven(hwy::SizeTag<2> /* tag */, const Vec128 a, + const Vec128 b) { + return Vec128{ + wasm_i16x8_shuffle(a.raw, b.raw, 8, 1, 10, 3, 12, 5, 14, 7)}; +} +template +HWY_INLINE Vec128 OddEven(hwy::SizeTag<4> /* tag */, const Vec128 a, + const Vec128 b) { + return Vec128{wasm_i32x4_shuffle(a.raw, b.raw, 4, 1, 6, 3)}; +} +template +HWY_INLINE Vec128 OddEven(hwy::SizeTag<8> /* tag */, const Vec128 a, + const Vec128 b) { + return Vec128{wasm_i64x2_shuffle(a.raw, b.raw, 2, 1)}; +} + +} // namespace detail + +template +HWY_API Vec128 OddEven(const Vec128 a, const Vec128 b) { + return detail::OddEven(hwy::SizeTag(), a, b); +} +template +HWY_API Vec128 OddEven(const Vec128 a, + const Vec128 b) { + return Vec128{wasm_i32x4_shuffle(a.raw, b.raw, 
4, 1, 6, 3)}; +} + +// ------------------------------ OddEvenBlocks +template +HWY_API Vec128 OddEvenBlocks(Vec128 /* odd */, Vec128 even) { + return even; +} + +// ------------------------------ SwapAdjacentBlocks + +template +HWY_API Vec128 SwapAdjacentBlocks(Vec128 v) { + return v; +} + +// ------------------------------ ReverseBlocks + +// Single block: no change +template +HWY_API Vec128 ReverseBlocks(Full128 /* tag */, const Vec128 v) { + return v; +} + +// ================================================== CONVERT + +// ------------------------------ Promotions (part w/ narrow lanes -> full) + +// Unsigned: zero-extend. +template +HWY_API Vec128 PromoteTo(Simd /* tag */, + const Vec128 v) { + return Vec128{wasm_u16x8_extend_low_u8x16(v.raw)}; +} +template +HWY_API Vec128 PromoteTo(Simd /* tag */, + const Vec128 v) { + return Vec128{ + wasm_u32x4_extend_low_u16x8(wasm_u16x8_extend_low_u8x16(v.raw))}; +} +template +HWY_API Vec128 PromoteTo(Simd /* tag */, + const Vec128 v) { + return Vec128{wasm_u16x8_extend_low_u8x16(v.raw)}; +} +template +HWY_API Vec128 PromoteTo(Simd /* tag */, + const Vec128 v) { + return Vec128{ + wasm_u32x4_extend_low_u16x8(wasm_u16x8_extend_low_u8x16(v.raw))}; +} +template +HWY_API Vec128 PromoteTo(Simd /* tag */, + const Vec128 v) { + return Vec128{wasm_u32x4_extend_low_u16x8(v.raw)}; +} +template +HWY_API Vec128 PromoteTo(Simd /* tag */, + const Vec128 v) { + return Vec128{wasm_u64x2_extend_low_u32x4(v.raw)}; +} + +template +HWY_API Vec128 PromoteTo(Simd /* tag */, + const Vec128 v) { + return Vec128{wasm_u32x4_extend_low_u16x8(v.raw)}; +} + +// Signed: replicate sign bit. +template +HWY_API Vec128 PromoteTo(Simd /* tag */, + const Vec128 v) { + return Vec128{wasm_i16x8_extend_low_i8x16(v.raw)}; +} +template +HWY_API Vec128 PromoteTo(Simd /* tag */, + const Vec128 v) { + return Vec128{ + wasm_i32x4_extend_low_i16x8(wasm_i16x8_extend_low_i8x16(v.raw))}; +} +template +HWY_API Vec128 PromoteTo(Simd /* tag */, + const Vec128 v) { + return Vec128{wasm_i32x4_extend_low_i16x8(v.raw)}; +} +template +HWY_API Vec128 PromoteTo(Simd /* tag */, + const Vec128 v) { + return Vec128{wasm_i64x2_extend_low_i32x4(v.raw)}; +} + +template +HWY_API Vec128 PromoteTo(Simd /* tag */, + const Vec128 v) { + return Vec128{wasm_f64x2_convert_low_i32x4(v.raw)}; +} + +template +HWY_API Vec128 PromoteTo(Simd df32, + const Vec128 v) { + const RebindToSigned di32; + const RebindToUnsigned du32; + // Expand to u32 so we can shift. 
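+  // binary16: 1 sign bit, 5 exponent bits (bias 15), 10 mantissa bits.
+  // 1.0f / 16384 / 1024 = 2^-24 is the value of a subnormal mantissa LSB
+  // (2^-14 * 2^-10).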
+ const auto bits16 = PromoteTo(du32, Vec128{v.raw}); + const auto sign = ShiftRight<15>(bits16); + const auto biased_exp = ShiftRight<10>(bits16) & Set(du32, 0x1F); + const auto mantissa = bits16 & Set(du32, 0x3FF); + const auto subnormal = + BitCast(du32, ConvertTo(df32, BitCast(di32, mantissa)) * + Set(df32, 1.0f / 16384 / 1024)); + + const auto biased_exp32 = biased_exp + Set(du32, 127 - 15); + const auto mantissa32 = ShiftLeft<23 - 10>(mantissa); + const auto normal = ShiftLeft<23>(biased_exp32) | mantissa32; + const auto bits32 = IfThenElse(biased_exp == Zero(du32), subnormal, normal); + return BitCast(df32, ShiftLeft<31>(sign) | bits32); +} + +template +HWY_API Vec128 PromoteTo(Simd df32, + const Vec128 v) { + const Rebind du16; + const RebindToSigned di32; + return BitCast(df32, ShiftLeft<16>(PromoteTo(di32, BitCast(du16, v)))); +} + +// ------------------------------ Demotions (full -> part w/ narrow lanes) + +template +HWY_API Vec128 DemoteTo(Simd /* tag */, + const Vec128 v) { + return Vec128{wasm_u16x8_narrow_i32x4(v.raw, v.raw)}; +} + +template +HWY_API Vec128 DemoteTo(Simd /* tag */, + const Vec128 v) { + return Vec128{wasm_i16x8_narrow_i32x4(v.raw, v.raw)}; +} + +template +HWY_API Vec128 DemoteTo(Simd /* tag */, + const Vec128 v) { + const auto intermediate = wasm_i16x8_narrow_i32x4(v.raw, v.raw); + return Vec128{ + wasm_u8x16_narrow_i16x8(intermediate, intermediate)}; +} + +template +HWY_API Vec128 DemoteTo(Simd /* tag */, + const Vec128 v) { + return Vec128{wasm_u8x16_narrow_i16x8(v.raw, v.raw)}; +} + +template +HWY_API Vec128 DemoteTo(Simd /* tag */, + const Vec128 v) { + const auto intermediate = wasm_i16x8_narrow_i32x4(v.raw, v.raw); + return Vec128{wasm_i8x16_narrow_i16x8(intermediate, intermediate)}; +} + +template +HWY_API Vec128 DemoteTo(Simd /* tag */, + const Vec128 v) { + return Vec128{wasm_i8x16_narrow_i16x8(v.raw, v.raw)}; +} + +template +HWY_API Vec128 DemoteTo(Simd /* di */, + const Vec128 v) { + return Vec128{wasm_i32x4_trunc_sat_f64x2_zero(v.raw)}; +} + +template +HWY_API Vec128 DemoteTo(Simd df16, + const Vec128 v) { + const RebindToUnsigned du16; + const Rebind du; + const RebindToSigned di; + const auto bits32 = BitCast(du, v); + const auto sign = ShiftRight<31>(bits32); + const auto biased_exp32 = ShiftRight<23>(bits32) & Set(du, 0xFF); + const auto mantissa32 = bits32 & Set(du, 0x7FFFFF); + + const auto k15 = Set(di, 15); + const auto exp = Min(BitCast(di, biased_exp32) - Set(di, 127), k15); + const auto is_tiny = exp < Set(di, -24); + + const auto is_subnormal = exp < Set(di, -14); + const auto biased_exp16 = + BitCast(du, IfThenZeroElse(is_subnormal, exp + k15)); + const auto sub_exp = BitCast(du, Set(di, -14) - exp); // [1, 11) + const auto sub_m = (Set(du, 1) << (Set(du, 10) - sub_exp)) + + (mantissa32 >> (Set(du, 13) + sub_exp)); + const auto mantissa16 = IfThenElse(RebindMask(du, is_subnormal), sub_m, + ShiftRight<13>(mantissa32)); // <1024 + + const auto sign16 = ShiftLeft<15>(sign); + const auto normal16 = sign16 | ShiftLeft<10>(biased_exp16) | mantissa16; + const auto bits16 = IfThenZeroElse(is_tiny, BitCast(di, normal16)); + return Vec128{DemoteTo(du16, bits16).raw}; +} + +template +HWY_API Vec128 DemoteTo(Simd dbf16, + const Vec128 v) { + const Rebind di32; + const Rebind du32; // for logical shift right + const Rebind du16; + const auto bits_in_32 = BitCast(di32, ShiftRight<16>(BitCast(du32, v))); + return BitCast(dbf16, DemoteTo(du16, bits_in_32)); +} + +template +HWY_API Vec128 ReorderDemote2To( + Simd dbf16, Vec128 a, Vec128 b) { + const 
RebindToUnsigned du16; + const Repartition du32; + const Vec128 b_in_even = ShiftRight<16>(BitCast(du32, b)); + return BitCast(dbf16, OddEven(BitCast(du16, a), BitCast(du16, b_in_even))); +} + +// Specializations for partial vectors because i16x8_narrow_i32x4 sets lanes +// above 2*N. +HWY_API Vec128 ReorderDemote2To(Simd dn, + Vec128 a, + Vec128 b) { + const Half dnh; + // Pretend the result has twice as many lanes so we can InterleaveLower. + const Vec128 an{DemoteTo(dnh, a).raw}; + const Vec128 bn{DemoteTo(dnh, b).raw}; + return InterleaveLower(an, bn); +} +HWY_API Vec128 ReorderDemote2To(Simd dn, + Vec128 a, + Vec128 b) { + const Half dnh; + // Pretend the result has twice as many lanes so we can InterleaveLower. + const Vec128 an{DemoteTo(dnh, a).raw}; + const Vec128 bn{DemoteTo(dnh, b).raw}; + return InterleaveLower(an, bn); +} +HWY_API Vec128 ReorderDemote2To(Full128 /*d16*/, + Vec128 a, Vec128 b) { + return Vec128{wasm_i16x8_narrow_i32x4(a.raw, b.raw)}; +} + +// For already range-limited input [0, 255]. +template +HWY_API Vec128 U8FromU32(const Vec128 v) { + const auto intermediate = wasm_i16x8_narrow_i32x4(v.raw, v.raw); + return Vec128{ + wasm_u8x16_narrow_i16x8(intermediate, intermediate)}; +} + +// ------------------------------ Truncations + +template * = nullptr> +HWY_API Vec128 TruncateTo(Simd /* tag */, + const Vec128 v) { + const Repartition> d; + const auto v1 = BitCast(d, v); + return Vec128{v1.raw}; +} + +HWY_API Vec128 TruncateTo(Simd /* tag */, + const Vec128 v) { + const Full128 d; + const auto v1 = BitCast(d, v); + const auto v2 = ConcatEven(d, v1, v1); + const auto v4 = ConcatEven(d, v2, v2); + return LowerHalf(LowerHalf(LowerHalf(ConcatEven(d, v4, v4)))); +} + +HWY_API Vec128 TruncateTo(Simd /* tag */, + const Vec128 v) { + const Full128 d; + const auto v1 = BitCast(d, v); + const auto v2 = ConcatEven(d, v1, v1); + return LowerHalf(LowerHalf(ConcatEven(d, v2, v2))); +} + +HWY_API Vec128 TruncateTo(Simd /* tag */, + const Vec128 v) { + const Full128 d; + const auto v1 = BitCast(d, v); + return LowerHalf(ConcatEven(d, v1, v1)); +} + +template = 2>* = nullptr> +HWY_API Vec128 TruncateTo(Simd /* tag */, + const Vec128 v) { + const Full128 d; + const auto v1 = Vec128{v.raw}; + const auto v2 = ConcatEven(d, v1, v1); + const auto v3 = ConcatEven(d, v2, v2); + return Vec128{v3.raw}; +} + +template = 2>* = nullptr> +HWY_API Vec128 TruncateTo(Simd /* tag */, + const Vec128 v) { + const Full128 d; + const auto v1 = Vec128{v.raw}; + const auto v2 = ConcatEven(d, v1, v1); + return Vec128{v2.raw}; +} + +template = 2>* = nullptr> +HWY_API Vec128 TruncateTo(Simd /* tag */, + const Vec128 v) { + const Full128 d; + const auto v1 = Vec128{v.raw}; + const auto v2 = ConcatEven(d, v1, v1); + return Vec128{v2.raw}; +} + +// ------------------------------ Convert i32 <=> f32 (Round) + +template +HWY_API Vec128 ConvertTo(Simd /* tag */, + const Vec128 v) { + return Vec128{wasm_f32x4_convert_i32x4(v.raw)}; +} +template +HWY_API Vec128 ConvertTo(Simd /* tag */, + const Vec128 v) { + return Vec128{wasm_f32x4_convert_u32x4(v.raw)}; +} +// Truncates (rounds toward zero). 
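+
+// E.g. ConvertTo maps -1.7f to -1 while NearestInt (below) maps it to -2
+// (round to nearest, ties to even); both saturate, so 3e9f becomes
+// 2147483647.
+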
+template +HWY_API Vec128 ConvertTo(Simd /* tag */, + const Vec128 v) { + return Vec128{wasm_i32x4_trunc_sat_f32x4(v.raw)}; +} + +template +HWY_API Vec128 NearestInt(const Vec128 v) { + return ConvertTo(Simd(), Round(v)); +} + +// ================================================== MISC + +// ------------------------------ SumsOf8 (ShiftRight, Add) +template +HWY_API Vec128 SumsOf8(const Vec128 v) { + const DFromV du8; + const RepartitionToWide du16; + const RepartitionToWide du32; + const RepartitionToWide du64; + using VU16 = VFromD; + + const VU16 vFDB97531 = ShiftRight<8>(BitCast(du16, v)); + const VU16 vECA86420 = And(BitCast(du16, v), Set(du16, 0xFF)); + const VU16 sFE_DC_BA_98_76_54_32_10 = Add(vFDB97531, vECA86420); + + const VU16 szz_FE_zz_BA_zz_76_zz_32 = + BitCast(du16, ShiftRight<16>(BitCast(du32, sFE_DC_BA_98_76_54_32_10))); + const VU16 sxx_FC_xx_B8_xx_74_xx_30 = + Add(sFE_DC_BA_98_76_54_32_10, szz_FE_zz_BA_zz_76_zz_32); + const VU16 szz_zz_xx_FC_zz_zz_xx_74 = + BitCast(du16, ShiftRight<32>(BitCast(du64, sxx_FC_xx_B8_xx_74_xx_30))); + const VU16 sxx_xx_xx_F8_xx_xx_xx_70 = + Add(sxx_FC_xx_B8_xx_74_xx_30, szz_zz_xx_FC_zz_zz_xx_74); + return And(BitCast(du64, sxx_xx_xx_F8_xx_xx_xx_70), Set(du64, 0xFFFF)); +} + +// ------------------------------ LoadMaskBits (TestBit) + +namespace detail { + +template +HWY_INLINE Mask128 LoadMaskBits(Simd d, uint64_t bits) { + const RebindToUnsigned du; + // Easier than Set(), which would require an >8-bit type, which would not + // compile for T=uint8_t, N=1. + const Vec128 vbits{wasm_i32x4_splat(static_cast(bits))}; + + // Replicate bytes 8x such that each byte contains the bit that governs it. + alignas(16) constexpr uint8_t kRep8[16] = {0, 0, 0, 0, 0, 0, 0, 0, + 1, 1, 1, 1, 1, 1, 1, 1}; + const auto rep8 = TableLookupBytes(vbits, Load(du, kRep8)); + + alignas(16) constexpr uint8_t kBit[16] = {1, 2, 4, 8, 16, 32, 64, 128, + 1, 2, 4, 8, 16, 32, 64, 128}; + return RebindMask(d, TestBit(rep8, LoadDup128(du, kBit))); +} + +template +HWY_INLINE Mask128 LoadMaskBits(Simd d, uint64_t bits) { + const RebindToUnsigned du; + alignas(16) constexpr uint16_t kBit[8] = {1, 2, 4, 8, 16, 32, 64, 128}; + return RebindMask( + d, TestBit(Set(du, static_cast(bits)), Load(du, kBit))); +} + +template +HWY_INLINE Mask128 LoadMaskBits(Simd d, uint64_t bits) { + const RebindToUnsigned du; + alignas(16) constexpr uint32_t kBit[8] = {1, 2, 4, 8}; + return RebindMask( + d, TestBit(Set(du, static_cast(bits)), Load(du, kBit))); +} + +template +HWY_INLINE Mask128 LoadMaskBits(Simd d, uint64_t bits) { + const RebindToUnsigned du; + alignas(16) constexpr uint64_t kBit[8] = {1, 2}; + return RebindMask(d, TestBit(Set(du, bits), Load(du, kBit))); +} + +} // namespace detail + +// `p` points to at least 8 readable bytes, not all of which need be valid. 
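+
+// Example (sketch, not part of the library): round-tripping a mask through
+// its packed bit form (bit i corresponds to lane i). Illustrative only.
+template <typename T, size_t N>
+bool MaskBitsRoundTrip(Simd<T, N, 0> d, Mask128<T, N> m) {
+  uint8_t bits[8] = {0};
+  (void)StoreMaskBits(d, m, bits);
+  const Mask128<T, N> m2 = LoadMaskBits(d, bits);
+  const RebindToUnsigned<decltype(d)> du;
+  return AllTrue(du, BitCast(du, VecFromMask(d, m)) ==
+                         BitCast(du, VecFromMask(d, m2)));
+}
+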
+template +HWY_API Mask128 LoadMaskBits(Simd d, + const uint8_t* HWY_RESTRICT bits) { + uint64_t mask_bits = 0; + CopyBytes<(N + 7) / 8>(bits, &mask_bits); + return detail::LoadMaskBits(d, mask_bits); +} + +// ------------------------------ Mask + +namespace detail { + +// Full +template +HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/, + const Mask128 mask) { + alignas(16) uint64_t lanes[2]; + wasm_v128_store(lanes, mask.raw); + + constexpr uint64_t kMagic = 0x103070F1F3F80ULL; + const uint64_t lo = ((lanes[0] * kMagic) >> 56); + const uint64_t hi = ((lanes[1] * kMagic) >> 48) & 0xFF00; + return (hi + lo); +} + +// 64-bit +template +HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/, + const Mask128 mask) { + constexpr uint64_t kMagic = 0x103070F1F3F80ULL; + return (static_cast(wasm_i64x2_extract_lane(mask.raw, 0)) * + kMagic) >> + 56; +} + +// 32-bit or less: need masking +template +HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/, + const Mask128 mask) { + uint64_t bytes = static_cast(wasm_i64x2_extract_lane(mask.raw, 0)); + // Clear potentially undefined bytes. + bytes &= (1ULL << (N * 8)) - 1; + constexpr uint64_t kMagic = 0x103070F1F3F80ULL; + return (bytes * kMagic) >> 56; +} + +template +HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<2> /*tag*/, + const Mask128 mask) { + // Remove useless lower half of each u16 while preserving the sign bit. + const __i16x8 zero = wasm_i16x8_splat(0); + const Mask128 mask8{wasm_i8x16_narrow_i16x8(mask.raw, zero)}; + return BitsFromMask(hwy::SizeTag<1>(), mask8); +} + +template +HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<4> /*tag*/, + const Mask128 mask) { + const __i32x4 mask_i = static_cast<__i32x4>(mask.raw); + const __i32x4 slice = wasm_i32x4_make(1, 2, 4, 8); + const __i32x4 sliced_mask = wasm_v128_and(mask_i, slice); + alignas(16) uint32_t lanes[4]; + wasm_v128_store(lanes, sliced_mask); + return lanes[0] | lanes[1] | lanes[2] | lanes[3]; +} + +template +HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<8> /*tag*/, + const Mask128 mask) { + const __i64x2 mask_i = static_cast<__i64x2>(mask.raw); + const __i64x2 slice = wasm_i64x2_make(1, 2); + const __i64x2 sliced_mask = wasm_v128_and(mask_i, slice); + alignas(16) uint64_t lanes[2]; + wasm_v128_store(lanes, sliced_mask); + return lanes[0] | lanes[1]; +} + +// Returns the lowest N bits for the BitsFromMask result. +template +constexpr uint64_t OnlyActive(uint64_t bits) { + return ((N * sizeof(T)) == 16) ? bits : bits & ((1ull << N) - 1); +} + +// Returns 0xFF for bytes with index >= N, otherwise 0. +template +constexpr __i8x16 BytesAbove() { + return /**/ + (N == 0) ? wasm_i32x4_make(-1, -1, -1, -1) + : (N == 4) ? wasm_i32x4_make(0, -1, -1, -1) + : (N == 8) ? wasm_i32x4_make(0, 0, -1, -1) + : (N == 12) ? wasm_i32x4_make(0, 0, 0, -1) + : (N == 16) ? wasm_i32x4_make(0, 0, 0, 0) + : (N == 2) ? wasm_i16x8_make(0, -1, -1, -1, -1, -1, -1, -1) + : (N == 6) ? wasm_i16x8_make(0, 0, 0, -1, -1, -1, -1, -1) + : (N == 10) ? wasm_i16x8_make(0, 0, 0, 0, 0, -1, -1, -1) + : (N == 14) ? wasm_i16x8_make(0, 0, 0, 0, 0, 0, 0, -1) + : (N == 1) ? wasm_i8x16_make(0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1) + : (N == 3) ? wasm_i8x16_make(0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1) + : (N == 5) ? wasm_i8x16_make(0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1) + : (N == 7) ? wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, + -1, -1, -1) + : (N == 9) ? 
wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, + -1, -1, -1) + : (N == 11) + ? wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1) + : (N == 13) + ? wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1) + : wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1); +} + +template +HWY_INLINE uint64_t BitsFromMask(const Mask128 mask) { + return OnlyActive(BitsFromMask(hwy::SizeTag(), mask)); +} + +template +HWY_INLINE size_t CountTrue(hwy::SizeTag<1> tag, const Mask128 m) { + return PopCount(BitsFromMask(tag, m)); +} + +template +HWY_INLINE size_t CountTrue(hwy::SizeTag<2> tag, const Mask128 m) { + return PopCount(BitsFromMask(tag, m)); +} + +template +HWY_INLINE size_t CountTrue(hwy::SizeTag<4> /*tag*/, const Mask128 m) { + const __i32x4 var_shift = wasm_i32x4_make(1, 2, 4, 8); + const __i32x4 shifted_bits = wasm_v128_and(m.raw, var_shift); + alignas(16) uint64_t lanes[2]; + wasm_v128_store(lanes, shifted_bits); + return PopCount(lanes[0] | lanes[1]); +} + +template +HWY_INLINE size_t CountTrue(hwy::SizeTag<8> /*tag*/, const Mask128 m) { + alignas(16) int64_t lanes[2]; + wasm_v128_store(lanes, m.raw); + return static_cast(-(lanes[0] + lanes[1])); +} + +} // namespace detail + +// `p` points to at least 8 writable bytes. +template +HWY_API size_t StoreMaskBits(const Simd /* tag */, + const Mask128 mask, uint8_t* bits) { + const uint64_t mask_bits = detail::BitsFromMask(mask); + const size_t kNumBytes = (N + 7) / 8; + CopyBytes(&mask_bits, bits); + return kNumBytes; +} + +template +HWY_API size_t CountTrue(const Simd /* tag */, const Mask128 m) { + return detail::CountTrue(hwy::SizeTag(), m); +} + +// Partial vector +template +HWY_API size_t CountTrue(const Simd d, const Mask128 m) { + // Ensure all undefined bytes are 0. + const Mask128 mask{detail::BytesAbove()}; + return CountTrue(d, Mask128{AndNot(mask, m).raw}); +} + +// Full vector +template +HWY_API bool AllFalse(const Full128 d, const Mask128 m) { +#if 0 + // Casting followed by wasm_i8x16_any_true results in wasm error: + // i32.eqz[0] expected type i32, found i8x16.popcnt of type s128 + const auto v8 = BitCast(Full128(), VecFromMask(d, m)); + return !wasm_i8x16_any_true(v8.raw); +#else + (void)d; + return (wasm_i64x2_extract_lane(m.raw, 0) | + wasm_i64x2_extract_lane(m.raw, 1)) == 0; +#endif +} + +// Full vector +namespace detail { +template +HWY_INLINE bool AllTrue(hwy::SizeTag<1> /*tag*/, const Mask128 m) { + return wasm_i8x16_all_true(m.raw); +} +template +HWY_INLINE bool AllTrue(hwy::SizeTag<2> /*tag*/, const Mask128 m) { + return wasm_i16x8_all_true(m.raw); +} +template +HWY_INLINE bool AllTrue(hwy::SizeTag<4> /*tag*/, const Mask128 m) { + return wasm_i32x4_all_true(m.raw); +} +template +HWY_INLINE bool AllTrue(hwy::SizeTag<8> /*tag*/, const Mask128 m) { + return wasm_i64x2_all_true(m.raw); +} + +} // namespace detail + +template +HWY_API bool AllTrue(const Simd /* tag */, const Mask128 m) { + return detail::AllTrue(hwy::SizeTag(), m); +} + +// Partial vectors + +template +HWY_API bool AllFalse(Simd /* tag */, const Mask128 m) { + // Ensure all undefined bytes are 0. + const Mask128 mask{detail::BytesAbove()}; + return AllFalse(Full128(), Mask128{AndNot(mask, m).raw}); +} + +template +HWY_API bool AllTrue(const Simd /* d */, const Mask128 m) { + // Ensure all undefined bytes are FF. 
+ const Mask128 mask{detail::BytesAbove()}; + return AllTrue(Full128(), Mask128{Or(mask, m).raw}); +} + +template +HWY_API size_t FindKnownFirstTrue(const Simd /* tag */, + const Mask128 mask) { + const uint64_t bits = detail::BitsFromMask(mask); + return Num0BitsBelowLS1Bit_Nonzero64(bits); +} + +template +HWY_API intptr_t FindFirstTrue(const Simd /* tag */, + const Mask128 mask) { + const uint64_t bits = detail::BitsFromMask(mask); + return bits ? static_cast(Num0BitsBelowLS1Bit_Nonzero64(bits)) : -1; +} + +// ------------------------------ Compress + +namespace detail { + +template +HWY_INLINE Vec128 IdxFromBits(const uint64_t mask_bits) { + HWY_DASSERT(mask_bits < 256); + const Simd d; + const Rebind d8; + const Simd du; + + // We need byte indices for TableLookupBytes (one vector's worth for each of + // 256 combinations of 8 mask bits). Loading them directly requires 4 KiB. We + // can instead store lane indices and convert to byte indices (2*lane + 0..1), + // with the doubling baked into the table. Unpacking nibbles is likely more + // costly than the higher cache footprint from storing bytes. + alignas(16) constexpr uint8_t table[256 * 8] = { + // PrintCompress16x8Tables + 0, 2, 4, 6, 8, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // + 2, 0, 4, 6, 8, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // + 4, 0, 2, 6, 8, 10, 12, 14, /**/ 0, 4, 2, 6, 8, 10, 12, 14, // + 2, 4, 0, 6, 8, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // + 6, 0, 2, 4, 8, 10, 12, 14, /**/ 0, 6, 2, 4, 8, 10, 12, 14, // + 2, 6, 0, 4, 8, 10, 12, 14, /**/ 0, 2, 6, 4, 8, 10, 12, 14, // + 4, 6, 0, 2, 8, 10, 12, 14, /**/ 0, 4, 6, 2, 8, 10, 12, 14, // + 2, 4, 6, 0, 8, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // + 8, 0, 2, 4, 6, 10, 12, 14, /**/ 0, 8, 2, 4, 6, 10, 12, 14, // + 2, 8, 0, 4, 6, 10, 12, 14, /**/ 0, 2, 8, 4, 6, 10, 12, 14, // + 4, 8, 0, 2, 6, 10, 12, 14, /**/ 0, 4, 8, 2, 6, 10, 12, 14, // + 2, 4, 8, 0, 6, 10, 12, 14, /**/ 0, 2, 4, 8, 6, 10, 12, 14, // + 6, 8, 0, 2, 4, 10, 12, 14, /**/ 0, 6, 8, 2, 4, 10, 12, 14, // + 2, 6, 8, 0, 4, 10, 12, 14, /**/ 0, 2, 6, 8, 4, 10, 12, 14, // + 4, 6, 8, 0, 2, 10, 12, 14, /**/ 0, 4, 6, 8, 2, 10, 12, 14, // + 2, 4, 6, 8, 0, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // + 10, 0, 2, 4, 6, 8, 12, 14, /**/ 0, 10, 2, 4, 6, 8, 12, 14, // + 2, 10, 0, 4, 6, 8, 12, 14, /**/ 0, 2, 10, 4, 6, 8, 12, 14, // + 4, 10, 0, 2, 6, 8, 12, 14, /**/ 0, 4, 10, 2, 6, 8, 12, 14, // + 2, 4, 10, 0, 6, 8, 12, 14, /**/ 0, 2, 4, 10, 6, 8, 12, 14, // + 6, 10, 0, 2, 4, 8, 12, 14, /**/ 0, 6, 10, 2, 4, 8, 12, 14, // + 2, 6, 10, 0, 4, 8, 12, 14, /**/ 0, 2, 6, 10, 4, 8, 12, 14, // + 4, 6, 10, 0, 2, 8, 12, 14, /**/ 0, 4, 6, 10, 2, 8, 12, 14, // + 2, 4, 6, 10, 0, 8, 12, 14, /**/ 0, 2, 4, 6, 10, 8, 12, 14, // + 8, 10, 0, 2, 4, 6, 12, 14, /**/ 0, 8, 10, 2, 4, 6, 12, 14, // + 2, 8, 10, 0, 4, 6, 12, 14, /**/ 0, 2, 8, 10, 4, 6, 12, 14, // + 4, 8, 10, 0, 2, 6, 12, 14, /**/ 0, 4, 8, 10, 2, 6, 12, 14, // + 2, 4, 8, 10, 0, 6, 12, 14, /**/ 0, 2, 4, 8, 10, 6, 12, 14, // + 6, 8, 10, 0, 2, 4, 12, 14, /**/ 0, 6, 8, 10, 2, 4, 12, 14, // + 2, 6, 8, 10, 0, 4, 12, 14, /**/ 0, 2, 6, 8, 10, 4, 12, 14, // + 4, 6, 8, 10, 0, 2, 12, 14, /**/ 0, 4, 6, 8, 10, 2, 12, 14, // + 2, 4, 6, 8, 10, 0, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // + 12, 0, 2, 4, 6, 8, 10, 14, /**/ 0, 12, 2, 4, 6, 8, 10, 14, // + 2, 12, 0, 4, 6, 8, 10, 14, /**/ 0, 2, 12, 4, 6, 8, 10, 14, // + 4, 12, 0, 2, 6, 8, 10, 14, /**/ 0, 4, 12, 2, 6, 8, 10, 14, // + 2, 4, 12, 0, 6, 8, 10, 14, /**/ 0, 2, 4, 12, 6, 8, 10, 14, // + 6, 12, 0, 2, 4, 8, 10, 14, /**/ 0, 6, 12, 2, 4, 
8, 10, 14, // + 2, 6, 12, 0, 4, 8, 10, 14, /**/ 0, 2, 6, 12, 4, 8, 10, 14, // + 4, 6, 12, 0, 2, 8, 10, 14, /**/ 0, 4, 6, 12, 2, 8, 10, 14, // + 2, 4, 6, 12, 0, 8, 10, 14, /**/ 0, 2, 4, 6, 12, 8, 10, 14, // + 8, 12, 0, 2, 4, 6, 10, 14, /**/ 0, 8, 12, 2, 4, 6, 10, 14, // + 2, 8, 12, 0, 4, 6, 10, 14, /**/ 0, 2, 8, 12, 4, 6, 10, 14, // + 4, 8, 12, 0, 2, 6, 10, 14, /**/ 0, 4, 8, 12, 2, 6, 10, 14, // + 2, 4, 8, 12, 0, 6, 10, 14, /**/ 0, 2, 4, 8, 12, 6, 10, 14, // + 6, 8, 12, 0, 2, 4, 10, 14, /**/ 0, 6, 8, 12, 2, 4, 10, 14, // + 2, 6, 8, 12, 0, 4, 10, 14, /**/ 0, 2, 6, 8, 12, 4, 10, 14, // + 4, 6, 8, 12, 0, 2, 10, 14, /**/ 0, 4, 6, 8, 12, 2, 10, 14, // + 2, 4, 6, 8, 12, 0, 10, 14, /**/ 0, 2, 4, 6, 8, 12, 10, 14, // + 10, 12, 0, 2, 4, 6, 8, 14, /**/ 0, 10, 12, 2, 4, 6, 8, 14, // + 2, 10, 12, 0, 4, 6, 8, 14, /**/ 0, 2, 10, 12, 4, 6, 8, 14, // + 4, 10, 12, 0, 2, 6, 8, 14, /**/ 0, 4, 10, 12, 2, 6, 8, 14, // + 2, 4, 10, 12, 0, 6, 8, 14, /**/ 0, 2, 4, 10, 12, 6, 8, 14, // + 6, 10, 12, 0, 2, 4, 8, 14, /**/ 0, 6, 10, 12, 2, 4, 8, 14, // + 2, 6, 10, 12, 0, 4, 8, 14, /**/ 0, 2, 6, 10, 12, 4, 8, 14, // + 4, 6, 10, 12, 0, 2, 8, 14, /**/ 0, 4, 6, 10, 12, 2, 8, 14, // + 2, 4, 6, 10, 12, 0, 8, 14, /**/ 0, 2, 4, 6, 10, 12, 8, 14, // + 8, 10, 12, 0, 2, 4, 6, 14, /**/ 0, 8, 10, 12, 2, 4, 6, 14, // + 2, 8, 10, 12, 0, 4, 6, 14, /**/ 0, 2, 8, 10, 12, 4, 6, 14, // + 4, 8, 10, 12, 0, 2, 6, 14, /**/ 0, 4, 8, 10, 12, 2, 6, 14, // + 2, 4, 8, 10, 12, 0, 6, 14, /**/ 0, 2, 4, 8, 10, 12, 6, 14, // + 6, 8, 10, 12, 0, 2, 4, 14, /**/ 0, 6, 8, 10, 12, 2, 4, 14, // + 2, 6, 8, 10, 12, 0, 4, 14, /**/ 0, 2, 6, 8, 10, 12, 4, 14, // + 4, 6, 8, 10, 12, 0, 2, 14, /**/ 0, 4, 6, 8, 10, 12, 2, 14, // + 2, 4, 6, 8, 10, 12, 0, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // + 14, 0, 2, 4, 6, 8, 10, 12, /**/ 0, 14, 2, 4, 6, 8, 10, 12, // + 2, 14, 0, 4, 6, 8, 10, 12, /**/ 0, 2, 14, 4, 6, 8, 10, 12, // + 4, 14, 0, 2, 6, 8, 10, 12, /**/ 0, 4, 14, 2, 6, 8, 10, 12, // + 2, 4, 14, 0, 6, 8, 10, 12, /**/ 0, 2, 4, 14, 6, 8, 10, 12, // + 6, 14, 0, 2, 4, 8, 10, 12, /**/ 0, 6, 14, 2, 4, 8, 10, 12, // + 2, 6, 14, 0, 4, 8, 10, 12, /**/ 0, 2, 6, 14, 4, 8, 10, 12, // + 4, 6, 14, 0, 2, 8, 10, 12, /**/ 0, 4, 6, 14, 2, 8, 10, 12, // + 2, 4, 6, 14, 0, 8, 10, 12, /**/ 0, 2, 4, 6, 14, 8, 10, 12, // + 8, 14, 0, 2, 4, 6, 10, 12, /**/ 0, 8, 14, 2, 4, 6, 10, 12, // + 2, 8, 14, 0, 4, 6, 10, 12, /**/ 0, 2, 8, 14, 4, 6, 10, 12, // + 4, 8, 14, 0, 2, 6, 10, 12, /**/ 0, 4, 8, 14, 2, 6, 10, 12, // + 2, 4, 8, 14, 0, 6, 10, 12, /**/ 0, 2, 4, 8, 14, 6, 10, 12, // + 6, 8, 14, 0, 2, 4, 10, 12, /**/ 0, 6, 8, 14, 2, 4, 10, 12, // + 2, 6, 8, 14, 0, 4, 10, 12, /**/ 0, 2, 6, 8, 14, 4, 10, 12, // + 4, 6, 8, 14, 0, 2, 10, 12, /**/ 0, 4, 6, 8, 14, 2, 10, 12, // + 2, 4, 6, 8, 14, 0, 10, 12, /**/ 0, 2, 4, 6, 8, 14, 10, 12, // + 10, 14, 0, 2, 4, 6, 8, 12, /**/ 0, 10, 14, 2, 4, 6, 8, 12, // + 2, 10, 14, 0, 4, 6, 8, 12, /**/ 0, 2, 10, 14, 4, 6, 8, 12, // + 4, 10, 14, 0, 2, 6, 8, 12, /**/ 0, 4, 10, 14, 2, 6, 8, 12, // + 2, 4, 10, 14, 0, 6, 8, 12, /**/ 0, 2, 4, 10, 14, 6, 8, 12, // + 6, 10, 14, 0, 2, 4, 8, 12, /**/ 0, 6, 10, 14, 2, 4, 8, 12, // + 2, 6, 10, 14, 0, 4, 8, 12, /**/ 0, 2, 6, 10, 14, 4, 8, 12, // + 4, 6, 10, 14, 0, 2, 8, 12, /**/ 0, 4, 6, 10, 14, 2, 8, 12, // + 2, 4, 6, 10, 14, 0, 8, 12, /**/ 0, 2, 4, 6, 10, 14, 8, 12, // + 8, 10, 14, 0, 2, 4, 6, 12, /**/ 0, 8, 10, 14, 2, 4, 6, 12, // + 2, 8, 10, 14, 0, 4, 6, 12, /**/ 0, 2, 8, 10, 14, 4, 6, 12, // + 4, 8, 10, 14, 0, 2, 6, 12, /**/ 0, 4, 8, 10, 14, 2, 6, 12, // + 2, 4, 8, 10, 14, 0, 6, 12, /**/ 0, 2, 4, 8, 10, 14, 6, 12, // + 6, 8, 10, 14, 0, 
2, 4, 12, /**/ 0, 6, 8, 10, 14, 2, 4, 12, // + 2, 6, 8, 10, 14, 0, 4, 12, /**/ 0, 2, 6, 8, 10, 14, 4, 12, // + 4, 6, 8, 10, 14, 0, 2, 12, /**/ 0, 4, 6, 8, 10, 14, 2, 12, // + 2, 4, 6, 8, 10, 14, 0, 12, /**/ 0, 2, 4, 6, 8, 10, 14, 12, // + 12, 14, 0, 2, 4, 6, 8, 10, /**/ 0, 12, 14, 2, 4, 6, 8, 10, // + 2, 12, 14, 0, 4, 6, 8, 10, /**/ 0, 2, 12, 14, 4, 6, 8, 10, // + 4, 12, 14, 0, 2, 6, 8, 10, /**/ 0, 4, 12, 14, 2, 6, 8, 10, // + 2, 4, 12, 14, 0, 6, 8, 10, /**/ 0, 2, 4, 12, 14, 6, 8, 10, // + 6, 12, 14, 0, 2, 4, 8, 10, /**/ 0, 6, 12, 14, 2, 4, 8, 10, // + 2, 6, 12, 14, 0, 4, 8, 10, /**/ 0, 2, 6, 12, 14, 4, 8, 10, // + 4, 6, 12, 14, 0, 2, 8, 10, /**/ 0, 4, 6, 12, 14, 2, 8, 10, // + 2, 4, 6, 12, 14, 0, 8, 10, /**/ 0, 2, 4, 6, 12, 14, 8, 10, // + 8, 12, 14, 0, 2, 4, 6, 10, /**/ 0, 8, 12, 14, 2, 4, 6, 10, // + 2, 8, 12, 14, 0, 4, 6, 10, /**/ 0, 2, 8, 12, 14, 4, 6, 10, // + 4, 8, 12, 14, 0, 2, 6, 10, /**/ 0, 4, 8, 12, 14, 2, 6, 10, // + 2, 4, 8, 12, 14, 0, 6, 10, /**/ 0, 2, 4, 8, 12, 14, 6, 10, // + 6, 8, 12, 14, 0, 2, 4, 10, /**/ 0, 6, 8, 12, 14, 2, 4, 10, // + 2, 6, 8, 12, 14, 0, 4, 10, /**/ 0, 2, 6, 8, 12, 14, 4, 10, // + 4, 6, 8, 12, 14, 0, 2, 10, /**/ 0, 4, 6, 8, 12, 14, 2, 10, // + 2, 4, 6, 8, 12, 14, 0, 10, /**/ 0, 2, 4, 6, 8, 12, 14, 10, // + 10, 12, 14, 0, 2, 4, 6, 8, /**/ 0, 10, 12, 14, 2, 4, 6, 8, // + 2, 10, 12, 14, 0, 4, 6, 8, /**/ 0, 2, 10, 12, 14, 4, 6, 8, // + 4, 10, 12, 14, 0, 2, 6, 8, /**/ 0, 4, 10, 12, 14, 2, 6, 8, // + 2, 4, 10, 12, 14, 0, 6, 8, /**/ 0, 2, 4, 10, 12, 14, 6, 8, // + 6, 10, 12, 14, 0, 2, 4, 8, /**/ 0, 6, 10, 12, 14, 2, 4, 8, // + 2, 6, 10, 12, 14, 0, 4, 8, /**/ 0, 2, 6, 10, 12, 14, 4, 8, // + 4, 6, 10, 12, 14, 0, 2, 8, /**/ 0, 4, 6, 10, 12, 14, 2, 8, // + 2, 4, 6, 10, 12, 14, 0, 8, /**/ 0, 2, 4, 6, 10, 12, 14, 8, // + 8, 10, 12, 14, 0, 2, 4, 6, /**/ 0, 8, 10, 12, 14, 2, 4, 6, // + 2, 8, 10, 12, 14, 0, 4, 6, /**/ 0, 2, 8, 10, 12, 14, 4, 6, // + 4, 8, 10, 12, 14, 0, 2, 6, /**/ 0, 4, 8, 10, 12, 14, 2, 6, // + 2, 4, 8, 10, 12, 14, 0, 6, /**/ 0, 2, 4, 8, 10, 12, 14, 6, // + 6, 8, 10, 12, 14, 0, 2, 4, /**/ 0, 6, 8, 10, 12, 14, 2, 4, // + 2, 6, 8, 10, 12, 14, 0, 4, /**/ 0, 2, 6, 8, 10, 12, 14, 4, // + 4, 6, 8, 10, 12, 14, 0, 2, /**/ 0, 4, 6, 8, 10, 12, 14, 2, // + 2, 4, 6, 8, 10, 12, 14, 0, /**/ 0, 2, 4, 6, 8, 10, 12, 14}; + + const Vec128 byte_idx{Load(d8, table + mask_bits * 8).raw}; + const Vec128 pairs = ZipLower(byte_idx, byte_idx); + return BitCast(d, pairs + Set(du, 0x0100)); +} + +template +HWY_INLINE Vec128 IdxFromNotBits(const uint64_t mask_bits) { + HWY_DASSERT(mask_bits < 256); + const Simd d; + const Rebind d8; + const Simd du; + + // We need byte indices for TableLookupBytes (one vector's worth for each of + // 256 combinations of 8 mask bits). Loading them directly requires 4 KiB. We + // can instead store lane indices and convert to byte indices (2*lane + 0..1), + // with the doubling baked into the table. Unpacking nibbles is likely more + // costly than the higher cache footprint from storing bytes. 
+ alignas(16) constexpr uint8_t table[256 * 8] = { + // PrintCompressNot16x8Tables + 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 8, 10, 12, 14, 0, // + 0, 4, 6, 8, 10, 12, 14, 2, /**/ 4, 6, 8, 10, 12, 14, 0, 2, // + 0, 2, 6, 8, 10, 12, 14, 4, /**/ 2, 6, 8, 10, 12, 14, 0, 4, // + 0, 6, 8, 10, 12, 14, 2, 4, /**/ 6, 8, 10, 12, 14, 0, 2, 4, // + 0, 2, 4, 8, 10, 12, 14, 6, /**/ 2, 4, 8, 10, 12, 14, 0, 6, // + 0, 4, 8, 10, 12, 14, 2, 6, /**/ 4, 8, 10, 12, 14, 0, 2, 6, // + 0, 2, 8, 10, 12, 14, 4, 6, /**/ 2, 8, 10, 12, 14, 0, 4, 6, // + 0, 8, 10, 12, 14, 2, 4, 6, /**/ 8, 10, 12, 14, 0, 2, 4, 6, // + 0, 2, 4, 6, 10, 12, 14, 8, /**/ 2, 4, 6, 10, 12, 14, 0, 8, // + 0, 4, 6, 10, 12, 14, 2, 8, /**/ 4, 6, 10, 12, 14, 0, 2, 8, // + 0, 2, 6, 10, 12, 14, 4, 8, /**/ 2, 6, 10, 12, 14, 0, 4, 8, // + 0, 6, 10, 12, 14, 2, 4, 8, /**/ 6, 10, 12, 14, 0, 2, 4, 8, // + 0, 2, 4, 10, 12, 14, 6, 8, /**/ 2, 4, 10, 12, 14, 0, 6, 8, // + 0, 4, 10, 12, 14, 2, 6, 8, /**/ 4, 10, 12, 14, 0, 2, 6, 8, // + 0, 2, 10, 12, 14, 4, 6, 8, /**/ 2, 10, 12, 14, 0, 4, 6, 8, // + 0, 10, 12, 14, 2, 4, 6, 8, /**/ 10, 12, 14, 0, 2, 4, 6, 8, // + 0, 2, 4, 6, 8, 12, 14, 10, /**/ 2, 4, 6, 8, 12, 14, 0, 10, // + 0, 4, 6, 8, 12, 14, 2, 10, /**/ 4, 6, 8, 12, 14, 0, 2, 10, // + 0, 2, 6, 8, 12, 14, 4, 10, /**/ 2, 6, 8, 12, 14, 0, 4, 10, // + 0, 6, 8, 12, 14, 2, 4, 10, /**/ 6, 8, 12, 14, 0, 2, 4, 10, // + 0, 2, 4, 8, 12, 14, 6, 10, /**/ 2, 4, 8, 12, 14, 0, 6, 10, // + 0, 4, 8, 12, 14, 2, 6, 10, /**/ 4, 8, 12, 14, 0, 2, 6, 10, // + 0, 2, 8, 12, 14, 4, 6, 10, /**/ 2, 8, 12, 14, 0, 4, 6, 10, // + 0, 8, 12, 14, 2, 4, 6, 10, /**/ 8, 12, 14, 0, 2, 4, 6, 10, // + 0, 2, 4, 6, 12, 14, 8, 10, /**/ 2, 4, 6, 12, 14, 0, 8, 10, // + 0, 4, 6, 12, 14, 2, 8, 10, /**/ 4, 6, 12, 14, 0, 2, 8, 10, // + 0, 2, 6, 12, 14, 4, 8, 10, /**/ 2, 6, 12, 14, 0, 4, 8, 10, // + 0, 6, 12, 14, 2, 4, 8, 10, /**/ 6, 12, 14, 0, 2, 4, 8, 10, // + 0, 2, 4, 12, 14, 6, 8, 10, /**/ 2, 4, 12, 14, 0, 6, 8, 10, // + 0, 4, 12, 14, 2, 6, 8, 10, /**/ 4, 12, 14, 0, 2, 6, 8, 10, // + 0, 2, 12, 14, 4, 6, 8, 10, /**/ 2, 12, 14, 0, 4, 6, 8, 10, // + 0, 12, 14, 2, 4, 6, 8, 10, /**/ 12, 14, 0, 2, 4, 6, 8, 10, // + 0, 2, 4, 6, 8, 10, 14, 12, /**/ 2, 4, 6, 8, 10, 14, 0, 12, // + 0, 4, 6, 8, 10, 14, 2, 12, /**/ 4, 6, 8, 10, 14, 0, 2, 12, // + 0, 2, 6, 8, 10, 14, 4, 12, /**/ 2, 6, 8, 10, 14, 0, 4, 12, // + 0, 6, 8, 10, 14, 2, 4, 12, /**/ 6, 8, 10, 14, 0, 2, 4, 12, // + 0, 2, 4, 8, 10, 14, 6, 12, /**/ 2, 4, 8, 10, 14, 0, 6, 12, // + 0, 4, 8, 10, 14, 2, 6, 12, /**/ 4, 8, 10, 14, 0, 2, 6, 12, // + 0, 2, 8, 10, 14, 4, 6, 12, /**/ 2, 8, 10, 14, 0, 4, 6, 12, // + 0, 8, 10, 14, 2, 4, 6, 12, /**/ 8, 10, 14, 0, 2, 4, 6, 12, // + 0, 2, 4, 6, 10, 14, 8, 12, /**/ 2, 4, 6, 10, 14, 0, 8, 12, // + 0, 4, 6, 10, 14, 2, 8, 12, /**/ 4, 6, 10, 14, 0, 2, 8, 12, // + 0, 2, 6, 10, 14, 4, 8, 12, /**/ 2, 6, 10, 14, 0, 4, 8, 12, // + 0, 6, 10, 14, 2, 4, 8, 12, /**/ 6, 10, 14, 0, 2, 4, 8, 12, // + 0, 2, 4, 10, 14, 6, 8, 12, /**/ 2, 4, 10, 14, 0, 6, 8, 12, // + 0, 4, 10, 14, 2, 6, 8, 12, /**/ 4, 10, 14, 0, 2, 6, 8, 12, // + 0, 2, 10, 14, 4, 6, 8, 12, /**/ 2, 10, 14, 0, 4, 6, 8, 12, // + 0, 10, 14, 2, 4, 6, 8, 12, /**/ 10, 14, 0, 2, 4, 6, 8, 12, // + 0, 2, 4, 6, 8, 14, 10, 12, /**/ 2, 4, 6, 8, 14, 0, 10, 12, // + 0, 4, 6, 8, 14, 2, 10, 12, /**/ 4, 6, 8, 14, 0, 2, 10, 12, // + 0, 2, 6, 8, 14, 4, 10, 12, /**/ 2, 6, 8, 14, 0, 4, 10, 12, // + 0, 6, 8, 14, 2, 4, 10, 12, /**/ 6, 8, 14, 0, 2, 4, 10, 12, // + 0, 2, 4, 8, 14, 6, 10, 12, /**/ 2, 4, 8, 14, 0, 6, 10, 12, // + 0, 4, 8, 14, 2, 6, 10, 12, /**/ 4, 8, 14, 0, 2, 6, 10, 12, // + 0, 2, 8, 14, 
4, 6, 10, 12, /**/ 2, 8, 14, 0, 4, 6, 10, 12, // + 0, 8, 14, 2, 4, 6, 10, 12, /**/ 8, 14, 0, 2, 4, 6, 10, 12, // + 0, 2, 4, 6, 14, 8, 10, 12, /**/ 2, 4, 6, 14, 0, 8, 10, 12, // + 0, 4, 6, 14, 2, 8, 10, 12, /**/ 4, 6, 14, 0, 2, 8, 10, 12, // + 0, 2, 6, 14, 4, 8, 10, 12, /**/ 2, 6, 14, 0, 4, 8, 10, 12, // + 0, 6, 14, 2, 4, 8, 10, 12, /**/ 6, 14, 0, 2, 4, 8, 10, 12, // + 0, 2, 4, 14, 6, 8, 10, 12, /**/ 2, 4, 14, 0, 6, 8, 10, 12, // + 0, 4, 14, 2, 6, 8, 10, 12, /**/ 4, 14, 0, 2, 6, 8, 10, 12, // + 0, 2, 14, 4, 6, 8, 10, 12, /**/ 2, 14, 0, 4, 6, 8, 10, 12, // + 0, 14, 2, 4, 6, 8, 10, 12, /**/ 14, 0, 2, 4, 6, 8, 10, 12, // + 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 8, 10, 12, 0, 14, // + 0, 4, 6, 8, 10, 12, 2, 14, /**/ 4, 6, 8, 10, 12, 0, 2, 14, // + 0, 2, 6, 8, 10, 12, 4, 14, /**/ 2, 6, 8, 10, 12, 0, 4, 14, // + 0, 6, 8, 10, 12, 2, 4, 14, /**/ 6, 8, 10, 12, 0, 2, 4, 14, // + 0, 2, 4, 8, 10, 12, 6, 14, /**/ 2, 4, 8, 10, 12, 0, 6, 14, // + 0, 4, 8, 10, 12, 2, 6, 14, /**/ 4, 8, 10, 12, 0, 2, 6, 14, // + 0, 2, 8, 10, 12, 4, 6, 14, /**/ 2, 8, 10, 12, 0, 4, 6, 14, // + 0, 8, 10, 12, 2, 4, 6, 14, /**/ 8, 10, 12, 0, 2, 4, 6, 14, // + 0, 2, 4, 6, 10, 12, 8, 14, /**/ 2, 4, 6, 10, 12, 0, 8, 14, // + 0, 4, 6, 10, 12, 2, 8, 14, /**/ 4, 6, 10, 12, 0, 2, 8, 14, // + 0, 2, 6, 10, 12, 4, 8, 14, /**/ 2, 6, 10, 12, 0, 4, 8, 14, // + 0, 6, 10, 12, 2, 4, 8, 14, /**/ 6, 10, 12, 0, 2, 4, 8, 14, // + 0, 2, 4, 10, 12, 6, 8, 14, /**/ 2, 4, 10, 12, 0, 6, 8, 14, // + 0, 4, 10, 12, 2, 6, 8, 14, /**/ 4, 10, 12, 0, 2, 6, 8, 14, // + 0, 2, 10, 12, 4, 6, 8, 14, /**/ 2, 10, 12, 0, 4, 6, 8, 14, // + 0, 10, 12, 2, 4, 6, 8, 14, /**/ 10, 12, 0, 2, 4, 6, 8, 14, // + 0, 2, 4, 6, 8, 12, 10, 14, /**/ 2, 4, 6, 8, 12, 0, 10, 14, // + 0, 4, 6, 8, 12, 2, 10, 14, /**/ 4, 6, 8, 12, 0, 2, 10, 14, // + 0, 2, 6, 8, 12, 4, 10, 14, /**/ 2, 6, 8, 12, 0, 4, 10, 14, // + 0, 6, 8, 12, 2, 4, 10, 14, /**/ 6, 8, 12, 0, 2, 4, 10, 14, // + 0, 2, 4, 8, 12, 6, 10, 14, /**/ 2, 4, 8, 12, 0, 6, 10, 14, // + 0, 4, 8, 12, 2, 6, 10, 14, /**/ 4, 8, 12, 0, 2, 6, 10, 14, // + 0, 2, 8, 12, 4, 6, 10, 14, /**/ 2, 8, 12, 0, 4, 6, 10, 14, // + 0, 8, 12, 2, 4, 6, 10, 14, /**/ 8, 12, 0, 2, 4, 6, 10, 14, // + 0, 2, 4, 6, 12, 8, 10, 14, /**/ 2, 4, 6, 12, 0, 8, 10, 14, // + 0, 4, 6, 12, 2, 8, 10, 14, /**/ 4, 6, 12, 0, 2, 8, 10, 14, // + 0, 2, 6, 12, 4, 8, 10, 14, /**/ 2, 6, 12, 0, 4, 8, 10, 14, // + 0, 6, 12, 2, 4, 8, 10, 14, /**/ 6, 12, 0, 2, 4, 8, 10, 14, // + 0, 2, 4, 12, 6, 8, 10, 14, /**/ 2, 4, 12, 0, 6, 8, 10, 14, // + 0, 4, 12, 2, 6, 8, 10, 14, /**/ 4, 12, 0, 2, 6, 8, 10, 14, // + 0, 2, 12, 4, 6, 8, 10, 14, /**/ 2, 12, 0, 4, 6, 8, 10, 14, // + 0, 12, 2, 4, 6, 8, 10, 14, /**/ 12, 0, 2, 4, 6, 8, 10, 14, // + 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 8, 10, 0, 12, 14, // + 0, 4, 6, 8, 10, 2, 12, 14, /**/ 4, 6, 8, 10, 0, 2, 12, 14, // + 0, 2, 6, 8, 10, 4, 12, 14, /**/ 2, 6, 8, 10, 0, 4, 12, 14, // + 0, 6, 8, 10, 2, 4, 12, 14, /**/ 6, 8, 10, 0, 2, 4, 12, 14, // + 0, 2, 4, 8, 10, 6, 12, 14, /**/ 2, 4, 8, 10, 0, 6, 12, 14, // + 0, 4, 8, 10, 2, 6, 12, 14, /**/ 4, 8, 10, 0, 2, 6, 12, 14, // + 0, 2, 8, 10, 4, 6, 12, 14, /**/ 2, 8, 10, 0, 4, 6, 12, 14, // + 0, 8, 10, 2, 4, 6, 12, 14, /**/ 8, 10, 0, 2, 4, 6, 12, 14, // + 0, 2, 4, 6, 10, 8, 12, 14, /**/ 2, 4, 6, 10, 0, 8, 12, 14, // + 0, 4, 6, 10, 2, 8, 12, 14, /**/ 4, 6, 10, 0, 2, 8, 12, 14, // + 0, 2, 6, 10, 4, 8, 12, 14, /**/ 2, 6, 10, 0, 4, 8, 12, 14, // + 0, 6, 10, 2, 4, 8, 12, 14, /**/ 6, 10, 0, 2, 4, 8, 12, 14, // + 0, 2, 4, 10, 6, 8, 12, 14, /**/ 2, 4, 10, 0, 6, 8, 12, 14, // + 0, 4, 10, 2, 6, 8, 12, 14, /**/ 4, 10, 0, 2, 6, 
8, 12, 14, // + 0, 2, 10, 4, 6, 8, 12, 14, /**/ 2, 10, 0, 4, 6, 8, 12, 14, // + 0, 10, 2, 4, 6, 8, 12, 14, /**/ 10, 0, 2, 4, 6, 8, 12, 14, // + 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 8, 0, 10, 12, 14, // + 0, 4, 6, 8, 2, 10, 12, 14, /**/ 4, 6, 8, 0, 2, 10, 12, 14, // + 0, 2, 6, 8, 4, 10, 12, 14, /**/ 2, 6, 8, 0, 4, 10, 12, 14, // + 0, 6, 8, 2, 4, 10, 12, 14, /**/ 6, 8, 0, 2, 4, 10, 12, 14, // + 0, 2, 4, 8, 6, 10, 12, 14, /**/ 2, 4, 8, 0, 6, 10, 12, 14, // + 0, 4, 8, 2, 6, 10, 12, 14, /**/ 4, 8, 0, 2, 6, 10, 12, 14, // + 0, 2, 8, 4, 6, 10, 12, 14, /**/ 2, 8, 0, 4, 6, 10, 12, 14, // + 0, 8, 2, 4, 6, 10, 12, 14, /**/ 8, 0, 2, 4, 6, 10, 12, 14, // + 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 0, 8, 10, 12, 14, // + 0, 4, 6, 2, 8, 10, 12, 14, /**/ 4, 6, 0, 2, 8, 10, 12, 14, // + 0, 2, 6, 4, 8, 10, 12, 14, /**/ 2, 6, 0, 4, 8, 10, 12, 14, // + 0, 6, 2, 4, 8, 10, 12, 14, /**/ 6, 0, 2, 4, 8, 10, 12, 14, // + 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 0, 6, 8, 10, 12, 14, // + 0, 4, 2, 6, 8, 10, 12, 14, /**/ 4, 0, 2, 6, 8, 10, 12, 14, // + 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 0, 4, 6, 8, 10, 12, 14, // + 0, 2, 4, 6, 8, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14}; + + const Vec128 byte_idx{Load(d8, table + mask_bits * 8).raw}; + const Vec128 pairs = ZipLower(byte_idx, byte_idx); + return BitCast(d, pairs + Set(du, 0x0100)); +} + +template +HWY_INLINE Vec128 IdxFromBits(const uint64_t mask_bits) { + HWY_DASSERT(mask_bits < 16); + + // There are only 4 lanes, so we can afford to load the index vector directly. + alignas(16) constexpr uint8_t u8_indices[16 * 16] = { + // PrintCompress32x4Tables + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, // + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, // + 4, 5, 6, 7, 0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, // + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, // + 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, // + 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, // + 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, // + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, // + 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, // + 0, 1, 2, 3, 12, 13, 14, 15, 4, 5, 6, 7, 8, 9, 10, 11, // + 4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 8, 9, 10, 11, // + 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 8, 9, 10, 11, // + 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, // + 0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, // + 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, // + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; + const Simd d; + const Repartition d8; + return BitCast(d, Load(d8, u8_indices + 16 * mask_bits)); +} + +template +HWY_INLINE Vec128 IdxFromNotBits(const uint64_t mask_bits) { + HWY_DASSERT(mask_bits < 16); + + // There are only 4 lanes, so we can afford to load the index vector directly. 
+ alignas(16) constexpr uint8_t u8_indices[16 * 16] = { + // PrintCompressNot32x4Tables + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, + 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3, + 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, + 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, + 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, 0, 1, + 2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, + 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, 0, 1, + 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, 8, 9, 10, 11, + 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, + 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, 0, 1, 2, 3, + 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, + 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, + 12, 13, 14, 15}; + const Simd d; + const Repartition d8; + return BitCast(d, Load(d8, u8_indices + 16 * mask_bits)); +} + +template +HWY_INLINE Vec128 IdxFromBits(const uint64_t mask_bits) { + HWY_DASSERT(mask_bits < 4); + + // There are only 2 lanes, so we can afford to load the index vector directly. + alignas(16) constexpr uint8_t u8_indices[4 * 16] = { + // PrintCompress64x2Tables + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; + + const Simd d; + const Repartition d8; + return BitCast(d, Load(d8, u8_indices + 16 * mask_bits)); +} + +template +HWY_INLINE Vec128 IdxFromNotBits(const uint64_t mask_bits) { + HWY_DASSERT(mask_bits < 4); + + // There are only 2 lanes, so we can afford to load the index vector directly. + alignas(16) constexpr uint8_t u8_indices[4 * 16] = { + // PrintCompressNot64x2Tables + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; + + const Simd d; + const Repartition d8; + return BitCast(d, Load(d8, u8_indices + 16 * mask_bits)); +} + +// Helper functions called by both Compress and CompressStore - avoids a +// redundant BitsFromMask in the latter. + +template +HWY_INLINE Vec128 Compress(Vec128 v, const uint64_t mask_bits) { + const auto idx = detail::IdxFromBits(mask_bits); + const DFromV d; + const RebindToSigned di; + return BitCast(d, TableLookupBytes(BitCast(di, v), BitCast(di, idx))); +} + +template +HWY_INLINE Vec128 CompressNot(Vec128 v, const uint64_t mask_bits) { + const auto idx = detail::IdxFromNotBits(mask_bits); + const DFromV d; + const RebindToSigned di; + return BitCast(d, TableLookupBytes(BitCast(di, v), BitCast(di, idx))); +} + +} // namespace detail + +template +struct CompressIsPartition { + enum { value = 1 }; +}; + +// Single lane: no-op +template +HWY_API Vec128 Compress(Vec128 v, Mask128 /*m*/) { + return v; +} + +// Two lanes: conditional swap +template +HWY_API Vec128 Compress(Vec128 v, Mask128 mask) { + // If mask[1] = 1 and mask[0] = 0, then swap both halves, else keep. 
+ const Full128 d; + const Vec128 m = VecFromMask(d, mask); + const Vec128 maskL = DupEven(m); + const Vec128 maskH = DupOdd(m); + const Vec128 swap = AndNot(maskL, maskH); + return IfVecThenElse(swap, Shuffle01(v), v); +} + +// General case +template +HWY_API Vec128 Compress(Vec128 v, Mask128 mask) { + return detail::Compress(v, detail::BitsFromMask(mask)); +} + +// Single lane: no-op +template +HWY_API Vec128 CompressNot(Vec128 v, Mask128 /*m*/) { + return v; +} + +// Two lanes: conditional swap +template +HWY_API Vec128 CompressNot(Vec128 v, Mask128 mask) { + // If mask[1] = 0 and mask[0] = 1, then swap both halves, else keep. + const Full128 d; + const Vec128 m = VecFromMask(d, mask); + const Vec128 maskL = DupEven(m); + const Vec128 maskH = DupOdd(m); + const Vec128 swap = AndNot(maskH, maskL); + return IfVecThenElse(swap, Shuffle01(v), v); +} + +// General case +template +HWY_API Vec128 CompressNot(Vec128 v, Mask128 mask) { + // For partial vectors, we cannot pull the Not() into the table because + // BitsFromMask clears the upper bits. + if (N < 16 / sizeof(T)) { + return detail::Compress(v, detail::BitsFromMask(Not(mask))); + } + return detail::CompressNot(v, detail::BitsFromMask(mask)); +} +// ------------------------------ CompressBlocksNot +HWY_API Vec128 CompressBlocksNot(Vec128 v, + Mask128 /* m */) { + return v; +} + +// ------------------------------ CompressBits + +template +HWY_API Vec128 CompressBits(Vec128 v, + const uint8_t* HWY_RESTRICT bits) { + uint64_t mask_bits = 0; + constexpr size_t kNumBytes = (N + 7) / 8; + CopyBytes(bits, &mask_bits); + if (N < 8) { + mask_bits &= (1ull << N) - 1; + } + + return detail::Compress(v, mask_bits); +} + +// ------------------------------ CompressStore +template +HWY_API size_t CompressStore(Vec128 v, const Mask128 mask, + Simd d, T* HWY_RESTRICT unaligned) { + const uint64_t mask_bits = detail::BitsFromMask(mask); + const auto c = detail::Compress(v, mask_bits); + StoreU(c, d, unaligned); + return PopCount(mask_bits); +} + +// ------------------------------ CompressBlendedStore +template +HWY_API size_t CompressBlendedStore(Vec128 v, Mask128 m, + Simd d, + T* HWY_RESTRICT unaligned) { + const RebindToUnsigned du; // so we can support fp16/bf16 + using TU = TFromD; + const uint64_t mask_bits = detail::BitsFromMask(m); + const size_t count = PopCount(mask_bits); + const Vec128 compressed = detail::Compress(BitCast(du, v), mask_bits); + const Mask128 store_mask = RebindMask(d, FirstN(du, count)); + BlendedStore(BitCast(d, compressed), store_mask, d, unaligned); + return count; +} + +// ------------------------------ CompressBitsStore + +template +HWY_API size_t CompressBitsStore(Vec128 v, + const uint8_t* HWY_RESTRICT bits, + Simd d, T* HWY_RESTRICT unaligned) { + uint64_t mask_bits = 0; + constexpr size_t kNumBytes = (N + 7) / 8; + CopyBytes(bits, &mask_bits); + if (N < 8) { + mask_bits &= (1ull << N) - 1; + } + + const auto c = detail::Compress(v, mask_bits); + StoreU(c, d, unaligned); + return PopCount(mask_bits); +} + +// ------------------------------ StoreInterleaved2/3/4 + +// HWY_NATIVE_LOAD_STORE_INTERLEAVED not set, hence defined in +// generic_ops-inl.h. 
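For orientation (an editorial sketch, not part of the patch): the Compress/CompressStore family above performs stream compaction, writing only the lanes whose mask bit is set, in lane order. A scalar equivalent of CompressStore:

```cpp
#include <cstddef>
#include <cstdint>

// Scalar model of CompressStore: copy selected lanes contiguously to out,
// returning how many were written (PopCount of the mask bits).
size_t CompressStoreScalar(const int32_t* lanes, const bool* mask, size_t n,
                           int32_t* out) {
  size_t count = 0;
  for (size_t i = 0; i < n; ++i) {
    if (mask[i]) out[count++] = lanes[i];
  }
  return count;
}
```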
+ +// ------------------------------ MulEven/Odd (Load) + +HWY_INLINE Vec128 MulEven(const Vec128 a, + const Vec128 b) { + alignas(16) uint64_t mul[2]; + mul[0] = + Mul128(static_cast(wasm_i64x2_extract_lane(a.raw, 0)), + static_cast(wasm_i64x2_extract_lane(b.raw, 0)), &mul[1]); + return Load(Full128(), mul); +} + +HWY_INLINE Vec128 MulOdd(const Vec128 a, + const Vec128 b) { + alignas(16) uint64_t mul[2]; + mul[0] = + Mul128(static_cast(wasm_i64x2_extract_lane(a.raw, 1)), + static_cast(wasm_i64x2_extract_lane(b.raw, 1)), &mul[1]); + return Load(Full128(), mul); +} + +// ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower) + +template +HWY_API Vec128 ReorderWidenMulAccumulate(Simd df32, + Vec128 a, + Vec128 b, + const Vec128 sum0, + Vec128& sum1) { + const Repartition du16; + const RebindToUnsigned du32; + const Vec128 zero = Zero(du16); + const Vec128 a0 = ZipLower(du32, zero, BitCast(du16, a)); + const Vec128 a1 = ZipUpper(du32, zero, BitCast(du16, a)); + const Vec128 b0 = ZipLower(du32, zero, BitCast(du16, b)); + const Vec128 b1 = ZipUpper(du32, zero, BitCast(du16, b)); + sum1 = MulAdd(BitCast(df32, a1), BitCast(df32, b1), sum1); + return MulAdd(BitCast(df32, a0), BitCast(df32, b0), sum0); +} + +// Even if N=1, the input is always at least 2 lanes, hence i32x4_dot_i16x8 is +// safe. +template +HWY_API Vec128 ReorderWidenMulAccumulate( + Simd /*d32*/, Vec128 a, + Vec128 b, const Vec128 sum0, + Vec128& /*sum1*/) { + return sum0 + Vec128{wasm_i32x4_dot_i16x8(a.raw, b.raw)}; +} + +// ------------------------------ Reductions + +namespace detail { + +// N=1 for any T: no-op +template +HWY_INLINE Vec128 SumOfLanes(hwy::SizeTag /* tag */, + const Vec128 v) { + return v; +} +template +HWY_INLINE Vec128 MinOfLanes(hwy::SizeTag /* tag */, + const Vec128 v) { + return v; +} +template +HWY_INLINE Vec128 MaxOfLanes(hwy::SizeTag /* tag */, + const Vec128 v) { + return v; +} + +// u32/i32/f32: + +// N=2 +template +HWY_INLINE Vec128 SumOfLanes(hwy::SizeTag<4> /* tag */, + const Vec128 v10) { + return v10 + Vec128{Shuffle2301(Vec128{v10.raw}).raw}; +} +template +HWY_INLINE Vec128 MinOfLanes(hwy::SizeTag<4> /* tag */, + const Vec128 v10) { + return Min(v10, Vec128{Shuffle2301(Vec128{v10.raw}).raw}); +} +template +HWY_INLINE Vec128 MaxOfLanes(hwy::SizeTag<4> /* tag */, + const Vec128 v10) { + return Max(v10, Vec128{Shuffle2301(Vec128{v10.raw}).raw}); +} + +// N=4 (full) +template +HWY_INLINE Vec128 SumOfLanes(hwy::SizeTag<4> /* tag */, + const Vec128 v3210) { + const Vec128 v1032 = Shuffle1032(v3210); + const Vec128 v31_20_31_20 = v3210 + v1032; + const Vec128 v20_31_20_31 = Shuffle0321(v31_20_31_20); + return v20_31_20_31 + v31_20_31_20; +} +template +HWY_INLINE Vec128 MinOfLanes(hwy::SizeTag<4> /* tag */, + const Vec128 v3210) { + const Vec128 v1032 = Shuffle1032(v3210); + const Vec128 v31_20_31_20 = Min(v3210, v1032); + const Vec128 v20_31_20_31 = Shuffle0321(v31_20_31_20); + return Min(v20_31_20_31, v31_20_31_20); +} +template +HWY_INLINE Vec128 MaxOfLanes(hwy::SizeTag<4> /* tag */, + const Vec128 v3210) { + const Vec128 v1032 = Shuffle1032(v3210); + const Vec128 v31_20_31_20 = Max(v3210, v1032); + const Vec128 v20_31_20_31 = Shuffle0321(v31_20_31_20); + return Max(v20_31_20_31, v31_20_31_20); +} + +// u64/i64/f64: + +// N=2 (full) +template +HWY_INLINE Vec128 SumOfLanes(hwy::SizeTag<8> /* tag */, + const Vec128 v10) { + const Vec128 v01 = Shuffle01(v10); + return v10 + v01; +} +template +HWY_INLINE Vec128 MinOfLanes(hwy::SizeTag<8> /* tag */, + const Vec128 v10) { + const Vec128 
v01 = Shuffle01(v10); + return Min(v10, v01); +} +template +HWY_INLINE Vec128 MaxOfLanes(hwy::SizeTag<8> /* tag */, + const Vec128 v10) { + const Vec128 v01 = Shuffle01(v10); + return Max(v10, v01); +} + +template +HWY_API Vec128 SumOfLanes(hwy::SizeTag<2> /* tag */, + Vec128 v) { + const Simd d; + const RepartitionToWide d32; + const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF)); + const auto odd = ShiftRight<16>(BitCast(d32, v)); + const auto sum = SumOfLanes(hwy::SizeTag<4>(), even + odd); + // Also broadcast into odd lanes. + return OddEven(BitCast(d, ShiftLeft<16>(sum)), BitCast(d, sum)); +} +template +HWY_API Vec128 SumOfLanes(hwy::SizeTag<2> /* tag */, + Vec128 v) { + const Simd d; + const RepartitionToWide d32; + // Sign-extend + const auto even = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, v))); + const auto odd = ShiftRight<16>(BitCast(d32, v)); + const auto sum = SumOfLanes(hwy::SizeTag<4>(), even + odd); + // Also broadcast into odd lanes. + return OddEven(BitCast(d, ShiftLeft<16>(sum)), BitCast(d, sum)); +} + +template +HWY_API Vec128 MinOfLanes(hwy::SizeTag<2> /* tag */, + Vec128 v) { + const Simd d; + const RepartitionToWide d32; + const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF)); + const auto odd = ShiftRight<16>(BitCast(d32, v)); + const auto min = MinOfLanes(hwy::SizeTag<4>(), Min(even, odd)); + // Also broadcast into odd lanes. + return OddEven(BitCast(d, ShiftLeft<16>(min)), BitCast(d, min)); +} +template +HWY_API Vec128 MinOfLanes(hwy::SizeTag<2> /* tag */, + Vec128 v) { + const Simd d; + const RepartitionToWide d32; + // Sign-extend + const auto even = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, v))); + const auto odd = ShiftRight<16>(BitCast(d32, v)); + const auto min = MinOfLanes(hwy::SizeTag<4>(), Min(even, odd)); + // Also broadcast into odd lanes. + return OddEven(BitCast(d, ShiftLeft<16>(min)), BitCast(d, min)); +} + +template +HWY_API Vec128 MaxOfLanes(hwy::SizeTag<2> /* tag */, + Vec128 v) { + const Simd d; + const RepartitionToWide d32; + const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF)); + const auto odd = ShiftRight<16>(BitCast(d32, v)); + const auto min = MaxOfLanes(hwy::SizeTag<4>(), Max(even, odd)); + // Also broadcast into odd lanes. + return OddEven(BitCast(d, ShiftLeft<16>(min)), BitCast(d, min)); +} +template +HWY_API Vec128 MaxOfLanes(hwy::SizeTag<2> /* tag */, + Vec128 v) { + const Simd d; + const RepartitionToWide d32; + // Sign-extend + const auto even = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, v))); + const auto odd = ShiftRight<16>(BitCast(d32, v)); + const auto min = MaxOfLanes(hwy::SizeTag<4>(), Max(even, odd)); + // Also broadcast into odd lanes. + return OddEven(BitCast(d, ShiftLeft<16>(min)), BitCast(d, min)); +} + +} // namespace detail + +// Supported for u/i/f 32/64. Returns the same value in each lane. +template +HWY_API Vec128 SumOfLanes(Simd /* tag */, const Vec128 v) { + return detail::SumOfLanes(hwy::SizeTag(), v); +} +template +HWY_API Vec128 MinOfLanes(Simd /* tag */, const Vec128 v) { + return detail::MinOfLanes(hwy::SizeTag(), v); +} +template +HWY_API Vec128 MaxOfLanes(Simd /* tag */, const Vec128 v) { + return detail::MaxOfLanes(hwy::SizeTag(), v); +} + +// ------------------------------ Lt128 + +template +HWY_INLINE Mask128 Lt128(Simd d, Vec128 a, + Vec128 b) { + static_assert(!IsSigned() && sizeof(T) == 8, "T must be u64"); + // Truth table of Eq and Lt for Hi and Lo u64. 
+ // (removed lines with (=H && cH) or (=L && cL) - cannot both be true) + // =H =L cH cL | out = cH | (=H & cL) + // 0 0 0 0 | 0 + // 0 0 0 1 | 0 + // 0 0 1 0 | 1 + // 0 0 1 1 | 1 + // 0 1 0 0 | 0 + // 0 1 0 1 | 0 + // 0 1 1 0 | 1 + // 1 0 0 0 | 0 + // 1 0 0 1 | 1 + // 1 1 0 0 | 0 + const Mask128 eqHL = Eq(a, b); + const Vec128 ltHL = VecFromMask(d, Lt(a, b)); + // We need to bring cL to the upper lane/bit corresponding to cH. Comparing + // the result of InterleaveUpper/Lower requires 9 ops, whereas shifting the + // comparison result leftwards requires only 4. IfThenElse compiles to the + // same code as OrAnd(). + const Vec128 ltLx = DupEven(ltHL); + const Vec128 outHx = IfThenElse(eqHL, ltLx, ltHL); + return MaskFromVec(DupOdd(outHx)); +} + +template +HWY_INLINE Mask128 Lt128Upper(Simd d, Vec128 a, + Vec128 b) { + const Vec128 ltHL = VecFromMask(d, Lt(a, b)); + return MaskFromVec(InterleaveUpper(d, ltHL, ltHL)); +} + +// ------------------------------ Eq128 + +template +HWY_INLINE Mask128 Eq128(Simd d, Vec128 a, + Vec128 b) { + static_assert(!IsSigned() && sizeof(T) == 8, "T must be u64"); + const Vec128 eqHL = VecFromMask(d, Eq(a, b)); + return MaskFromVec(And(Reverse2(d, eqHL), eqHL)); +} + +template +HWY_INLINE Mask128 Eq128Upper(Simd d, Vec128 a, + Vec128 b) { + const Vec128 eqHL = VecFromMask(d, Eq(a, b)); + return MaskFromVec(InterleaveUpper(d, eqHL, eqHL)); +} + +// ------------------------------ Ne128 + +template +HWY_INLINE Mask128 Ne128(Simd d, Vec128 a, + Vec128 b) { + static_assert(!IsSigned() && sizeof(T) == 8, "T must be u64"); + const Vec128 neHL = VecFromMask(d, Ne(a, b)); + return MaskFromVec(Or(Reverse2(d, neHL), neHL)); +} + +template +HWY_INLINE Mask128 Ne128Upper(Simd d, Vec128 a, + Vec128 b) { + const Vec128 neHL = VecFromMask(d, Ne(a, b)); + return MaskFromVec(InterleaveUpper(d, neHL, neHL)); +} + +// ------------------------------ Min128, Max128 (Lt128) + +// Without a native OddEven, it seems infeasible to go faster than Lt128. +template +HWY_INLINE VFromD Min128(D d, const VFromD a, const VFromD b) { + return IfThenElse(Lt128(d, a, b), a, b); +} + +template +HWY_INLINE VFromD Max128(D d, const VFromD a, const VFromD b) { + return IfThenElse(Lt128(d, b, a), a, b); +} + +template +HWY_INLINE VFromD Min128Upper(D d, const VFromD a, const VFromD b) { + return IfThenElse(Lt128Upper(d, a, b), a, b); +} + +template +HWY_INLINE VFromD Max128Upper(D d, const VFromD a, const VFromD b) { + return IfThenElse(Lt128Upper(d, b, a), a, b); +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); diff --git a/hwy/ops/wasm_256-inl.h b/hwy/ops/wasm_256-inl.h new file mode 100644 index 0000000..42f4fb2 --- /dev/null +++ b/hwy/ops/wasm_256-inl.h @@ -0,0 +1,3060 @@ +// Copyright 2021 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// 256-bit WASM vectors and operations. Experimental. 
+// External include guard in highway.h - see comment there. + +#include +#include +#include + +#include "hwy/base.h" +#include "hwy/ops/shared-inl.h" +#include "hwy/ops/wasm_128-inl.h" + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { + +template +using Full256 = Simd; + +template +using Full128 = Simd; + +// TODO(richardwinterton): add this to DeduceD in wasm_128 similar to x86_128. +template +class Vec256 { + public: + // Compound assignment. Only usable if there is a corresponding non-member + // binary operator overload. For example, only f32 and f64 support division. + HWY_INLINE Vec256& operator*=(const Vec256 other) { + return *this = (*this * other); + } + HWY_INLINE Vec256& operator/=(const Vec256 other) { + return *this = (*this / other); + } + HWY_INLINE Vec256& operator+=(const Vec256 other) { + return *this = (*this + other); + } + HWY_INLINE Vec256& operator-=(const Vec256 other) { + return *this = (*this - other); + } + HWY_INLINE Vec256& operator&=(const Vec256 other) { + return *this = (*this & other); + } + HWY_INLINE Vec256& operator|=(const Vec256 other) { + return *this = (*this | other); + } + HWY_INLINE Vec256& operator^=(const Vec256 other) { + return *this = (*this ^ other); + } + + Vec128 v0; + Vec128 v1; +}; + +template +struct Mask256 { + Mask128 m0; + Mask128 m1; +}; + +// ------------------------------ BitCast + +template +HWY_API Vec256 BitCast(Full256 d, Vec256 v) { + const Half dh; + Vec256 ret; + ret.v0 = BitCast(dh, v.v0); + ret.v1 = BitCast(dh, v.v1); + return ret; + + // TODO(richardwinterton): implement other ops like this +} + +// ------------------------------ Zero + +// Returns an all-zero vector/part. +template +HWY_API Vec256 Zero(Full256 /* tag */) { + return Vec256{wasm_i32x4_splat(0)}; +} +HWY_API Vec256 Zero(Full256 /* tag */) { + return Vec256{wasm_f32x4_splat(0.0f)}; +} + +template +using VFromD = decltype(Zero(D())); + +// ------------------------------ Set + +// Returns a vector/part with all lanes set to "t". +HWY_API Vec256 Set(Full256 /* tag */, const uint8_t t) { + return Vec256{wasm_i8x16_splat(static_cast(t))}; +} +HWY_API Vec256 Set(Full256 /* tag */, const uint16_t t) { + return Vec256{wasm_i16x8_splat(static_cast(t))}; +} +HWY_API Vec256 Set(Full256 /* tag */, const uint32_t t) { + return Vec256{wasm_i32x4_splat(static_cast(t))}; +} +HWY_API Vec256 Set(Full256 /* tag */, const uint64_t t) { + return Vec256{wasm_i64x2_splat(static_cast(t))}; +} + +HWY_API Vec256 Set(Full256 /* tag */, const int8_t t) { + return Vec256{wasm_i8x16_splat(t)}; +} +HWY_API Vec256 Set(Full256 /* tag */, const int16_t t) { + return Vec256{wasm_i16x8_splat(t)}; +} +HWY_API Vec256 Set(Full256 /* tag */, const int32_t t) { + return Vec256{wasm_i32x4_splat(t)}; +} +HWY_API Vec256 Set(Full256 /* tag */, const int64_t t) { + return Vec256{wasm_i64x2_splat(t)}; +} + +HWY_API Vec256 Set(Full256 /* tag */, const float t) { + return Vec256{wasm_f32x4_splat(t)}; +} + +HWY_DIAGNOSTICS(push) +HWY_DIAGNOSTICS_OFF(disable : 4700, ignored "-Wuninitialized") + +// Returns a vector with uninitialized elements. +template +HWY_API Vec256 Undefined(Full256 d) { + return Zero(d); +} + +HWY_DIAGNOSTICS(pop) + +// Returns a vector with lane i=[0, N) set to "first" + i. 
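An illustration (not part of the patch) of the Iota contract documented above, as a scalar loop: lane i receives first + i, with the addition performed in the arithmetic type T2 before converting to the lane type:

```cpp
#include <cstddef>

// Hypothetical scalar equivalent of Iota, for illustration.
template <typename T, typename T2>
void IotaScalar(T* lanes, size_t n, T2 first) {
  for (size_t i = 0; i < n; ++i) {
    lanes[i] = static_cast<T>(first + static_cast<T2>(i));
  }
}
// E.g. for T=int32_t, first=5, n=8: {5, 6, 7, 8, 9, 10, 11, 12}.
```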
+template +Vec256 Iota(const Full256 d, const T2 first) { + HWY_ALIGN T lanes[16 / sizeof(T)]; + for (size_t i = 0; i < 16 / sizeof(T); ++i) { + lanes[i] = static_cast(first + static_cast(i)); + } + return Load(d, lanes); +} + +// ================================================== ARITHMETIC + +// ------------------------------ Addition + +// Unsigned +HWY_API Vec256 operator+(const Vec256 a, + const Vec256 b) { + return Vec256{wasm_i8x16_add(a.raw, b.raw)}; +} +HWY_API Vec256 operator+(const Vec256 a, + const Vec256 b) { + return Vec256{wasm_i16x8_add(a.raw, b.raw)}; +} +HWY_API Vec256 operator+(const Vec256 a, + const Vec256 b) { + return Vec256{wasm_i32x4_add(a.raw, b.raw)}; +} + +// Signed +HWY_API Vec256 operator+(const Vec256 a, + const Vec256 b) { + return Vec256{wasm_i8x16_add(a.raw, b.raw)}; +} +HWY_API Vec256 operator+(const Vec256 a, + const Vec256 b) { + return Vec256{wasm_i16x8_add(a.raw, b.raw)}; +} +HWY_API Vec256 operator+(const Vec256 a, + const Vec256 b) { + return Vec256{wasm_i32x4_add(a.raw, b.raw)}; +} + +// Float +HWY_API Vec256 operator+(const Vec256 a, const Vec256 b) { + return Vec256{wasm_f32x4_add(a.raw, b.raw)}; +} + +// ------------------------------ Subtraction + +// Unsigned +HWY_API Vec256 operator-(const Vec256 a, + const Vec256 b) { + return Vec256{wasm_i8x16_sub(a.raw, b.raw)}; +} +HWY_API Vec256 operator-(Vec256 a, Vec256 b) { + return Vec256{wasm_i16x8_sub(a.raw, b.raw)}; +} +HWY_API Vec256 operator-(const Vec256 a, + const Vec256 b) { + return Vec256{wasm_i32x4_sub(a.raw, b.raw)}; +} + +// Signed +HWY_API Vec256 operator-(const Vec256 a, + const Vec256 b) { + return Vec256{wasm_i8x16_sub(a.raw, b.raw)}; +} +HWY_API Vec256 operator-(const Vec256 a, + const Vec256 b) { + return Vec256{wasm_i16x8_sub(a.raw, b.raw)}; +} +HWY_API Vec256 operator-(const Vec256 a, + const Vec256 b) { + return Vec256{wasm_i32x4_sub(a.raw, b.raw)}; +} + +// Float +HWY_API Vec256 operator-(const Vec256 a, const Vec256 b) { + return Vec256{wasm_f32x4_sub(a.raw, b.raw)}; +} + +// ------------------------------ SumsOf8 +HWY_API Vec256 SumsOf8(const Vec256 v) { + HWY_ABORT("not implemented"); +} + +// ------------------------------ SaturatedAdd + +// Returns a + b clamped to the destination range. + +// Unsigned +HWY_API Vec256 SaturatedAdd(const Vec256 a, + const Vec256 b) { + return Vec256{wasm_u8x16_add_sat(a.raw, b.raw)}; +} +HWY_API Vec256 SaturatedAdd(const Vec256 a, + const Vec256 b) { + return Vec256{wasm_u16x8_add_sat(a.raw, b.raw)}; +} + +// Signed +HWY_API Vec256 SaturatedAdd(const Vec256 a, + const Vec256 b) { + return Vec256{wasm_i8x16_add_sat(a.raw, b.raw)}; +} +HWY_API Vec256 SaturatedAdd(const Vec256 a, + const Vec256 b) { + return Vec256{wasm_i16x8_add_sat(a.raw, b.raw)}; +} + +// ------------------------------ SaturatedSub + +// Returns a - b clamped to the destination range. 
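A numeric illustration (not from the patch) of the saturating subtraction defined next, and of SaturatedAdd above: results clamp to the lane type's range instead of wrapping:

```cpp
#include <algorithm>
#include <cstdint>
#include <cstdio>

// Scalar models of the saturating lane operations.
uint8_t SaturatedSubU8(uint8_t a, uint8_t b) {
  return static_cast<uint8_t>(std::max(0, a - b));  // clamp at 0
}
int8_t SaturatedAddI8(int8_t a, int8_t b) {
  return static_cast<int8_t>(std::min(127, std::max(-128, a + b)));
}

int main() {
  printf("%u\n", SaturatedSubU8(10, 200));   // 0 rather than wrapping to 66
  printf("%d\n", SaturatedAddI8(100, 100));  // 127 rather than wrapping to -56
}
```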
+
+// Unsigned
+HWY_API Vec256<uint8_t> SaturatedSub(const Vec256<uint8_t> a,
+                                     const Vec256<uint8_t> b) {
+  return Vec256<uint8_t>{wasm_u8x16_sub_sat(a.raw, b.raw)};
+}
+HWY_API Vec256<uint16_t> SaturatedSub(const Vec256<uint16_t> a,
+                                      const Vec256<uint16_t> b) {
+  return Vec256<uint16_t>{wasm_u16x8_sub_sat(a.raw, b.raw)};
+}
+
+// Signed
+HWY_API Vec256<int8_t> SaturatedSub(const Vec256<int8_t> a,
+                                    const Vec256<int8_t> b) {
+  return Vec256<int8_t>{wasm_i8x16_sub_sat(a.raw, b.raw)};
+}
+HWY_API Vec256<int16_t> SaturatedSub(const Vec256<int16_t> a,
+                                     const Vec256<int16_t> b) {
+  return Vec256<int16_t>{wasm_i16x8_sub_sat(a.raw, b.raw)};
+}
+
+// ------------------------------ Average
+
+// Returns (a + b + 1) / 2
+
+// Unsigned
+HWY_API Vec256<uint8_t> AverageRound(const Vec256<uint8_t> a,
+                                     const Vec256<uint8_t> b) {
+  return Vec256<uint8_t>{wasm_u8x16_avgr(a.raw, b.raw)};
+}
+HWY_API Vec256<uint16_t> AverageRound(const Vec256<uint16_t> a,
+                                      const Vec256<uint16_t> b) {
+  return Vec256<uint16_t>{wasm_u16x8_avgr(a.raw, b.raw)};
+}
+
+// ------------------------------ Absolute value
+
+// Returns absolute value, except that LimitsMin() maps to LimitsMax() + 1.
+HWY_API Vec256<int8_t> Abs(const Vec256<int8_t> v) {
+  return Vec256<int8_t>{wasm_i8x16_abs(v.raw)};
+}
+HWY_API Vec256<int16_t> Abs(const Vec256<int16_t> v) {
+  return Vec256<int16_t>{wasm_i16x8_abs(v.raw)};
+}
+HWY_API Vec256<int32_t> Abs(const Vec256<int32_t> v) {
+  return Vec256<int32_t>{wasm_i32x4_abs(v.raw)};
+}
+HWY_API Vec256<int64_t> Abs(const Vec256<int64_t> v) {
+  return Vec256<int64_t>{wasm_i64x2_abs(v.raw)};
+}
+
+HWY_API Vec256<float> Abs(const Vec256<float> v) {
+  return Vec256<float>{wasm_f32x4_abs(v.raw)};
+}
+
+// ------------------------------ Shift lanes by constant #bits
+
+// Unsigned
+template <int kBits>
+HWY_API Vec256<uint16_t> ShiftLeft(const Vec256<uint16_t> v) {
+  return Vec256<uint16_t>{wasm_i16x8_shl(v.raw, kBits)};
+}
+template <int kBits>
+HWY_API Vec256<uint16_t> ShiftRight(const Vec256<uint16_t> v) {
+  return Vec256<uint16_t>{wasm_u16x8_shr(v.raw, kBits)};
+}
+template <int kBits>
+HWY_API Vec256<uint32_t> ShiftLeft(const Vec256<uint32_t> v) {
+  return Vec256<uint32_t>{wasm_i32x4_shl(v.raw, kBits)};
+}
+template <int kBits>
+HWY_API Vec256<uint32_t> ShiftRight(const Vec256<uint32_t> v) {
+  return Vec256<uint32_t>{wasm_u32x4_shr(v.raw, kBits)};
+}
+
+// Signed
+template <int kBits>
+HWY_API Vec256<int16_t> ShiftLeft(const Vec256<int16_t> v) {
+  return Vec256<int16_t>{wasm_i16x8_shl(v.raw, kBits)};
+}
+template <int kBits>
+HWY_API Vec256<int16_t> ShiftRight(const Vec256<int16_t> v) {
+  return Vec256<int16_t>{wasm_i16x8_shr(v.raw, kBits)};
+}
+template <int kBits>
+HWY_API Vec256<int32_t> ShiftLeft(const Vec256<int32_t> v) {
+  return Vec256<int32_t>{wasm_i32x4_shl(v.raw, kBits)};
+}
+template <int kBits>
+HWY_API Vec256<int32_t> ShiftRight(const Vec256<int32_t> v) {
+  return Vec256<int32_t>{wasm_i32x4_shr(v.raw, kBits)};
+}
+
+// 8-bit
+template <int kBits, typename T, HWY_IF_LANE_SIZE(T, 1)>
+HWY_API Vec256<T> ShiftLeft(const Vec256<T> v) {
+  const Full256<T> d8;
+  // Use raw instead of BitCast to support N=1.
+  const Vec256<T> shifted{ShiftLeft<kBits>(Vec128<MakeWide<T>>{v.raw}).raw};
+  return kBits == 1
+             ? (v + v)
+             : (shifted & Set(d8, static_cast<T>((0xFF << kBits) & 0xFF)));
+}
+
+template <int kBits>
+HWY_API Vec256<uint8_t> ShiftRight(const Vec256<uint8_t> v) {
+  const Full256<uint8_t> d8;
+  // Use raw instead of BitCast to support N=1.
+ const Vec256 shifted{ShiftRight(Vec128{v.raw}).raw}; + return shifted & Set(d8, 0xFF >> kBits); +} + +template +HWY_API Vec256 ShiftRight(const Vec256 v) { + const Full256 di; + const Full256 du; + const auto shifted = BitCast(di, ShiftRight(BitCast(du, v))); + const auto shifted_sign = BitCast(di, Set(du, 0x80 >> kBits)); + return (shifted ^ shifted_sign) - shifted_sign; +} + +// ------------------------------ RotateRight (ShiftRight, Or) +template +HWY_API Vec256 RotateRight(const Vec256 v) { + constexpr size_t kSizeInBits = sizeof(T) * 8; + static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift count"); + if (kBits == 0) return v; + return Or(ShiftRight(v), ShiftLeft(v)); +} + +// ------------------------------ Shift lanes by same variable #bits + +// Unsigned +HWY_API Vec256 ShiftLeftSame(const Vec256 v, + const int bits) { + return Vec256{wasm_i16x8_shl(v.raw, bits)}; +} +HWY_API Vec256 ShiftRightSame(const Vec256 v, + const int bits) { + return Vec256{wasm_u16x8_shr(v.raw, bits)}; +} +HWY_API Vec256 ShiftLeftSame(const Vec256 v, + const int bits) { + return Vec256{wasm_i32x4_shl(v.raw, bits)}; +} +HWY_API Vec256 ShiftRightSame(const Vec256 v, + const int bits) { + return Vec256{wasm_u32x4_shr(v.raw, bits)}; +} + +// Signed +HWY_API Vec256 ShiftLeftSame(const Vec256 v, const int bits) { + return Vec256{wasm_i16x8_shl(v.raw, bits)}; +} +HWY_API Vec256 ShiftRightSame(const Vec256 v, + const int bits) { + return Vec256{wasm_i16x8_shr(v.raw, bits)}; +} +HWY_API Vec256 ShiftLeftSame(const Vec256 v, const int bits) { + return Vec256{wasm_i32x4_shl(v.raw, bits)}; +} +HWY_API Vec256 ShiftRightSame(const Vec256 v, + const int bits) { + return Vec256{wasm_i32x4_shr(v.raw, bits)}; +} + +// 8-bit +template +HWY_API Vec256 ShiftLeftSame(const Vec256 v, const int bits) { + const Full256 d8; + // Use raw instead of BitCast to support N=1. + const Vec256 shifted{ShiftLeftSame(Vec128>{v.raw}, bits).raw}; + return shifted & Set(d8, (0xFF << bits) & 0xFF); +} + +HWY_API Vec256 ShiftRightSame(Vec256 v, const int bits) { + const Full256 d8; + // Use raw instead of BitCast to support N=1. 
+  const Vec256<uint8_t> shifted{
+      ShiftRightSame(Vec128<uint16_t>{v.raw}, bits).raw};
+  return shifted & Set(d8, 0xFF >> bits);
+}
+
+HWY_API Vec256<int8_t> ShiftRightSame(Vec256<int8_t> v, const int bits) {
+  const Full256<int8_t> di;
+  const Full256<uint8_t> du;
+  const auto shifted = BitCast(di, ShiftRightSame(BitCast(du, v), bits));
+  const auto shifted_sign = BitCast(di, Set(du, 0x80 >> bits));
+  return (shifted ^ shifted_sign) - shifted_sign;
+}
+
+// ------------------------------ Minimum
+
+// Unsigned
+HWY_API Vec256<uint8_t> Min(const Vec256<uint8_t> a, const Vec256<uint8_t> b) {
+  return Vec256<uint8_t>{wasm_u8x16_min(a.raw, b.raw)};
+}
+HWY_API Vec256<uint16_t> Min(const Vec256<uint16_t> a,
+                             const Vec256<uint16_t> b) {
+  return Vec256<uint16_t>{wasm_u16x8_min(a.raw, b.raw)};
+}
+HWY_API Vec256<uint32_t> Min(const Vec256<uint32_t> a,
+                             const Vec256<uint32_t> b) {
+  return Vec256<uint32_t>{wasm_u32x4_min(a.raw, b.raw)};
+}
+HWY_API Vec256<uint64_t> Min(const Vec256<uint64_t> a,
+                             const Vec256<uint64_t> b) {
+  // 64-bit lanes: no wasm u64x2 min, so compare per lane in scalar u64.
+  alignas(32) uint64_t min[2];
+  min[0] = HWY_MIN(static_cast<uint64_t>(wasm_i64x2_extract_lane(a.raw, 0)),
+                   static_cast<uint64_t>(wasm_i64x2_extract_lane(b.raw, 0)));
+  min[1] = HWY_MIN(static_cast<uint64_t>(wasm_i64x2_extract_lane(a.raw, 1)),
+                   static_cast<uint64_t>(wasm_i64x2_extract_lane(b.raw, 1)));
+  return Vec256<uint64_t>{wasm_v128_load(min)};
+}
+
+// Signed
+HWY_API Vec256<int8_t> Min(const Vec256<int8_t> a, const Vec256<int8_t> b) {
+  return Vec256<int8_t>{wasm_i8x16_min(a.raw, b.raw)};
+}
+HWY_API Vec256<int16_t> Min(const Vec256<int16_t> a, const Vec256<int16_t> b) {
+  return Vec256<int16_t>{wasm_i16x8_min(a.raw, b.raw)};
+}
+HWY_API Vec256<int32_t> Min(const Vec256<int32_t> a, const Vec256<int32_t> b) {
+  return Vec256<int32_t>{wasm_i32x4_min(a.raw, b.raw)};
+}
+HWY_API Vec256<int64_t> Min(const Vec256<int64_t> a, const Vec256<int64_t> b) {
+  alignas(32) int64_t min[2];
+  min[0] = HWY_MIN(wasm_i64x2_extract_lane(a.raw, 0),
+                   wasm_i64x2_extract_lane(b.raw, 0));
+  min[1] = HWY_MIN(wasm_i64x2_extract_lane(a.raw, 1),
+                   wasm_i64x2_extract_lane(b.raw, 1));
+  return Vec256<int64_t>{wasm_v128_load(min)};
+}
+
+// Float
+HWY_API Vec256<float> Min(const Vec256<float> a, const Vec256<float> b) {
+  return Vec256<float>{wasm_f32x4_min(a.raw, b.raw)};
+}
+
+// ------------------------------ Maximum
+
+// Unsigned
+HWY_API Vec256<uint8_t> Max(const Vec256<uint8_t> a, const Vec256<uint8_t> b) {
+  return Vec256<uint8_t>{wasm_u8x16_max(a.raw, b.raw)};
+}
+HWY_API Vec256<uint16_t> Max(const Vec256<uint16_t> a,
+                             const Vec256<uint16_t> b) {
+  return Vec256<uint16_t>{wasm_u16x8_max(a.raw, b.raw)};
+}
+HWY_API Vec256<uint32_t> Max(const Vec256<uint32_t> a,
+                             const Vec256<uint32_t> b) {
+  return Vec256<uint32_t>{wasm_u32x4_max(a.raw, b.raw)};
+}
+HWY_API Vec256<uint64_t> Max(const Vec256<uint64_t> a,
+                             const Vec256<uint64_t> b) {
+  // 64-bit lanes: no wasm u64x2 max, so compare per lane in scalar u64.
+  alignas(32) uint64_t max[2];
+  max[0] = HWY_MAX(static_cast<uint64_t>(wasm_i64x2_extract_lane(a.raw, 0)),
+                   static_cast<uint64_t>(wasm_i64x2_extract_lane(b.raw, 0)));
+  max[1] = HWY_MAX(static_cast<uint64_t>(wasm_i64x2_extract_lane(a.raw, 1)),
+                   static_cast<uint64_t>(wasm_i64x2_extract_lane(b.raw, 1)));
+  return Vec256<uint64_t>{wasm_v128_load(max)};
+}
+
+// Signed
+HWY_API Vec256<int8_t> Max(const Vec256<int8_t> a, const Vec256<int8_t> b) {
+  return Vec256<int8_t>{wasm_i8x16_max(a.raw, b.raw)};
+}
+HWY_API Vec256<int16_t> Max(const Vec256<int16_t> a, const Vec256<int16_t> b) {
+  return Vec256<int16_t>{wasm_i16x8_max(a.raw, b.raw)};
+}
+HWY_API Vec256<int32_t> Max(const Vec256<int32_t> a, const Vec256<int32_t> b) {
+  return Vec256<int32_t>{wasm_i32x4_max(a.raw, b.raw)};
+}
+HWY_API Vec256<int64_t> Max(const Vec256<int64_t> a, const Vec256<int64_t> b) {
+  alignas(32) int64_t max[2];
+  max[0] = HWY_MAX(wasm_i64x2_extract_lane(a.raw, 0),
+                   wasm_i64x2_extract_lane(b.raw, 0));
+  max[1] = HWY_MAX(wasm_i64x2_extract_lane(a.raw, 1),
+                   wasm_i64x2_extract_lane(b.raw, 1));
+  return Vec256<int64_t>{wasm_v128_load(max)};
+}
+
+// Float
+HWY_API Vec256<float> Max(const Vec256<float> a, const Vec256<float> b) {
+  return Vec256<float>{wasm_f32x4_max(a.raw, b.raw)};
+}
+
+// ------------------------------ Integer multiplication
+
+// Unsigned
+HWY_API Vec256<uint16_t> operator*(const Vec256<uint16_t> a,
+                                   const Vec256<uint16_t> b) {
+  return Vec256<uint16_t>{wasm_i16x8_mul(a.raw, b.raw)};
+}
+HWY_API Vec256<uint32_t> operator*(const Vec256<uint32_t> a,
+                                   const Vec256<uint32_t> b) {
+  return Vec256<uint32_t>{wasm_i32x4_mul(a.raw, b.raw)};
+}
+
+// Signed
+HWY_API Vec256<int16_t> operator*(const
+HWY_API Vec256<int16_t> operator*(const Vec256<int16_t> a,
+                                  const Vec256<int16_t> b) {
+  return Vec256<int16_t>{wasm_i16x8_mul(a.raw, b.raw)};
+}
+HWY_API Vec256<int32_t> operator*(const Vec256<int32_t> a,
+                                  const Vec256<int32_t> b) {
+  return Vec256<int32_t>{wasm_i32x4_mul(a.raw, b.raw)};
+}
+
+// Returns the upper 16 bits of a * b in each lane.
+HWY_API Vec256<uint16_t> MulHigh(const Vec256<uint16_t> a,
+                                 const Vec256<uint16_t> b) {
+  // TODO(eustas): replace, when implemented in WASM.
+  const auto al = wasm_u32x4_extend_low_u16x8(a.raw);
+  const auto ah = wasm_u32x4_extend_high_u16x8(a.raw);
+  const auto bl = wasm_u32x4_extend_low_u16x8(b.raw);
+  const auto bh = wasm_u32x4_extend_high_u16x8(b.raw);
+  const auto l = wasm_i32x4_mul(al, bl);
+  const auto h = wasm_i32x4_mul(ah, bh);
+  // TODO(eustas): shift-right + narrow?
+  return Vec256<uint16_t>{
+      wasm_i16x8_shuffle(l, h, 1, 3, 5, 7, 9, 11, 13, 15)};
+}
+HWY_API Vec256<int16_t> MulHigh(const Vec256<int16_t> a,
+                                const Vec256<int16_t> b) {
+  // TODO(eustas): replace, when implemented in WASM.
+  const auto al = wasm_i32x4_extend_low_i16x8(a.raw);
+  const auto ah = wasm_i32x4_extend_high_i16x8(a.raw);
+  const auto bl = wasm_i32x4_extend_low_i16x8(b.raw);
+  const auto bh = wasm_i32x4_extend_high_i16x8(b.raw);
+  const auto l = wasm_i32x4_mul(al, bl);
+  const auto h = wasm_i32x4_mul(ah, bh);
+  // TODO(eustas): shift-right + narrow?
+  return Vec256<int16_t>{
+      wasm_i16x8_shuffle(l, h, 1, 3, 5, 7, 9, 11, 13, 15)};
+}
+
+HWY_API Vec256<int16_t> MulFixedPoint15(Vec256<int16_t>, Vec256<int16_t>) {
+  HWY_ASSERT(0);  // Not implemented
+}
+
+// Multiplies even lanes (0, 2 ..) and returns the double-width result.
+HWY_API Vec256<int64_t> MulEven(const Vec256<int32_t> a,
+                                const Vec256<int32_t> b) {
+  // TODO(eustas): replace, when implemented in WASM.
+  const auto kEvenMask = wasm_i32x4_make(-1, 0, -1, 0);
+  const auto ae = wasm_v128_and(a.raw, kEvenMask);
+  const auto be = wasm_v128_and(b.raw, kEvenMask);
+  return Vec256<int64_t>{wasm_i64x2_mul(ae, be)};
+}
+HWY_API Vec256<uint64_t> MulEven(const Vec256<uint32_t> a,
+                                 const Vec256<uint32_t> b) {
+  // TODO(eustas): replace, when implemented in WASM.
+  const auto kEvenMask = wasm_i32x4_make(-1, 0, -1, 0);
+  const auto ae = wasm_v128_and(a.raw, kEvenMask);
+  const auto be = wasm_v128_and(b.raw, kEvenMask);
+  return Vec256<uint64_t>{wasm_i64x2_mul(ae, be)};
+}
+
+// ------------------------------ Negate
+
+template <typename T, HWY_IF_FLOAT(T)>
+HWY_API Vec256<T> Neg(const Vec256<T> v) {
+  return Xor(v, SignBit(Full256<T>()));
+}
+
+HWY_API Vec256<int8_t> Neg(const Vec256<int8_t> v) {
+  return Vec256<int8_t>{wasm_i8x16_neg(v.raw)};
+}
+HWY_API Vec256<int16_t> Neg(const Vec256<int16_t> v) {
+  return Vec256<int16_t>{wasm_i16x8_neg(v.raw)};
+}
+HWY_API Vec256<int32_t> Neg(const Vec256<int32_t> v) {
+  return Vec256<int32_t>{wasm_i32x4_neg(v.raw)};
+}
+HWY_API Vec256<int64_t> Neg(const Vec256<int64_t> v) {
+  return Vec256<int64_t>{wasm_i64x2_neg(v.raw)};
+}
+
+// ------------------------------ Floating-point mul / div
+
+HWY_API Vec256<float> operator*(Vec256<float> a, Vec256<float> b) {
+  return Vec256<float>{wasm_f32x4_mul(a.raw, b.raw)};
+}
+
+HWY_API Vec256<float> operator/(const Vec256<float> a,
+                                const Vec256<float> b) {
+  return Vec256<float>{wasm_f32x4_div(a.raw, b.raw)};
+}
+
+// Approximate reciprocal
+HWY_API Vec256<float> ApproximateReciprocal(const Vec256<float> v) {
+  const Vec256<float> one = Vec256<float>{wasm_f32x4_splat(1.0f)};
+  return one / v;
+}
+
+// Absolute value of difference.
+HWY_API Vec256<float> AbsDiff(const Vec256<float> a, const Vec256<float> b) {
+  return Abs(a - b);
+}
+
+// ------------------------------ Floating-point multiply-add variants
+
+// Returns mul * x + add
+HWY_API Vec256<float> MulAdd(const Vec256<float> mul, const Vec256<float> x,
+                             const Vec256<float> add) {
+  // TODO(eustas): replace, when implemented in WASM.
+  // TODO(eustas): is it wasm_f32x4_qfma?
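+  // If the relaxed-simd extension is available, a fused form may exist; a
+  // hedged sketch (the intrinsic name is an assumption and is not part of
+  // the baseline wasm_simd128.h):
+  //   return Vec256<float>{wasm_f32x4_relaxed_madd(mul.raw, x.raw, add.raw)};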
+  return mul * x + add;
+}
+
+// Returns add - mul * x
+HWY_API Vec256<float> NegMulAdd(const Vec256<float> mul, const Vec256<float> x,
+                                const Vec256<float> add) {
+  // TODO(eustas): replace, when implemented in WASM.
+  return add - mul * x;
+}
+
+// Returns mul * x - sub
+HWY_API Vec256<float> MulSub(const Vec256<float> mul, const Vec256<float> x,
+                             const Vec256<float> sub) {
+  // TODO(eustas): replace, when implemented in WASM.
+  // TODO(eustas): is it wasm_f32x4_qfms?
+  return mul * x - sub;
+}
+
+// Returns -mul * x - sub
+HWY_API Vec256<float> NegMulSub(const Vec256<float> mul, const Vec256<float> x,
+                                const Vec256<float> sub) {
+  // TODO(eustas): replace, when implemented in WASM.
+  return Neg(mul) * x - sub;
+}
+
+// ------------------------------ Floating-point square root
+
+// Full precision square root
+HWY_API Vec256<float> Sqrt(const Vec256<float> v) {
+  return Vec256<float>{wasm_f32x4_sqrt(v.raw)};
+}
+
+// Approximate reciprocal square root
+HWY_API Vec256<float> ApproximateReciprocalSqrt(const Vec256<float> v) {
+  // TODO(eustas): find a cheaper way to calculate this.
+  const Vec256<float> one = Vec256<float>{wasm_f32x4_splat(1.0f)};
+  return one / Sqrt(v);
+}
+
+// ------------------------------ Floating-point rounding
+
+// Toward nearest integer, ties to even
+HWY_API Vec256<float> Round(const Vec256<float> v) {
+  return Vec256<float>{wasm_f32x4_nearest(v.raw)};
+}
+
+// Toward zero, aka truncate
+HWY_API Vec256<float> Trunc(const Vec256<float> v) {
+  return Vec256<float>{wasm_f32x4_trunc(v.raw)};
+}
+
+// Toward +infinity, aka ceiling
+HWY_API Vec256<float> Ceil(const Vec256<float> v) {
+  return Vec256<float>{wasm_f32x4_ceil(v.raw)};
+}
+
+// Toward -infinity, aka floor
+HWY_API Vec256<float> Floor(const Vec256<float> v) {
+  return Vec256<float>{wasm_f32x4_floor(v.raw)};
+}
+
+// ------------------------------ Floating-point classification
+
+template <typename T>
+HWY_API Mask256<T> IsNaN(const Vec256<T> v) {
+  return v != v;
+}
+
+template <typename T, HWY_IF_FLOAT(T)>
+HWY_API Mask256<T> IsInf(const Vec256<T> v) {
+  const Full256<T> d;
+  const RebindToSigned<decltype(d)> di;
+  const VFromD<decltype(di)> vi = BitCast(di, v);
+  // 'Shift left' to clear the sign bit, check for exponent=max and mantissa=0.
+  return RebindMask(d, Eq(Add(vi, vi), Set(di, hwy::MaxExponentTimes2<T>())));
+}
+
+// Returns whether normal/subnormal/zero.
+template <typename T, HWY_IF_FLOAT(T)>
+HWY_API Mask256<T> IsFinite(const Vec256<T> v) {
+  const Full256<T> d;
+  const RebindToUnsigned<decltype(d)> du;
+  const RebindToSigned<decltype(d)> di;  // cheaper than unsigned comparison
+  const VFromD<decltype(du)> vu = BitCast(du, v);
+  // 'Shift left' to clear the sign bit, then right so we can compare with the
+  // max exponent (cannot compare with MaxExponentTimes2 directly because it is
+  // negative and non-negative floats would be greater).
+  const VFromD<decltype(di)> exp =
+      BitCast(di, ShiftRight<hwy::MantissaBits<T>() + 1>(Add(vu, vu)));
+  return RebindMask(d, Lt(exp, Set(di, hwy::MaxExponentField<T>())));
+}
+
+// ================================================== COMPARE
+
+// Comparisons fill a lane with 1-bits if the condition is true, else 0.
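+// E.g. comparing u32 lanes {1, 2, 3, 4} with {1, 0, 3, 0} for equality
+// yields the mask lanes {0xFFFFFFFF, 0, 0xFFFFFFFF, 0}.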
+ +template +HWY_API Mask256 RebindMask(Full256 /*tag*/, Mask256 m) { + static_assert(sizeof(TFrom) == sizeof(TTo), "Must have same size"); + return Mask256{m.raw}; +} + +template +HWY_API Mask256 TestBit(Vec256 v, Vec256 bit) { + static_assert(!hwy::IsFloat(), "Only integer vectors supported"); + return (v & bit) == bit; +} + +// ------------------------------ Equality + +// Unsigned +HWY_API Mask256 operator==(const Vec256 a, + const Vec256 b) { + return Mask256{wasm_i8x16_eq(a.raw, b.raw)}; +} +HWY_API Mask256 operator==(const Vec256 a, + const Vec256 b) { + return Mask256{wasm_i16x8_eq(a.raw, b.raw)}; +} +HWY_API Mask256 operator==(const Vec256 a, + const Vec256 b) { + return Mask256{wasm_i32x4_eq(a.raw, b.raw)}; +} + +// Signed +HWY_API Mask256 operator==(const Vec256 a, + const Vec256 b) { + return Mask256{wasm_i8x16_eq(a.raw, b.raw)}; +} +HWY_API Mask256 operator==(Vec256 a, Vec256 b) { + return Mask256{wasm_i16x8_eq(a.raw, b.raw)}; +} +HWY_API Mask256 operator==(const Vec256 a, + const Vec256 b) { + return Mask256{wasm_i32x4_eq(a.raw, b.raw)}; +} + +// Float +HWY_API Mask256 operator==(const Vec256 a, + const Vec256 b) { + return Mask256{wasm_f32x4_eq(a.raw, b.raw)}; +} + +// ------------------------------ Inequality + +// Unsigned +HWY_API Mask256 operator!=(const Vec256 a, + const Vec256 b) { + return Mask256{wasm_i8x16_ne(a.raw, b.raw)}; +} +HWY_API Mask256 operator!=(const Vec256 a, + const Vec256 b) { + return Mask256{wasm_i16x8_ne(a.raw, b.raw)}; +} +HWY_API Mask256 operator!=(const Vec256 a, + const Vec256 b) { + return Mask256{wasm_i32x4_ne(a.raw, b.raw)}; +} + +// Signed +HWY_API Mask256 operator!=(const Vec256 a, + const Vec256 b) { + return Mask256{wasm_i8x16_ne(a.raw, b.raw)}; +} +HWY_API Mask256 operator!=(Vec256 a, Vec256 b) { + return Mask256{wasm_i16x8_ne(a.raw, b.raw)}; +} +HWY_API Mask256 operator!=(const Vec256 a, + const Vec256 b) { + return Mask256{wasm_i32x4_ne(a.raw, b.raw)}; +} + +// Float +HWY_API Mask256 operator!=(const Vec256 a, + const Vec256 b) { + return Mask256{wasm_f32x4_ne(a.raw, b.raw)}; +} + +// ------------------------------ Strict inequality + +HWY_API Mask256 operator>(const Vec256 a, + const Vec256 b) { + return Mask256{wasm_i8x16_gt(a.raw, b.raw)}; +} +HWY_API Mask256 operator>(const Vec256 a, + const Vec256 b) { + return Mask256{wasm_i16x8_gt(a.raw, b.raw)}; +} +HWY_API Mask256 operator>(const Vec256 a, + const Vec256 b) { + return Mask256{wasm_i32x4_gt(a.raw, b.raw)}; +} +HWY_API Mask256 operator>(const Vec256 a, + const Vec256 b) { + const Rebind < int32_t, DFromV d32; + const auto a32 = BitCast(d32, a); + const auto b32 = BitCast(d32, b); + // If the upper half is less than or greater, this is the answer. + const auto m_gt = a32 < b32; + + // Otherwise, the lower half decides. + const auto m_eq = a32 == b32; + const auto lo_in_hi = wasm_i32x4_shuffle(m_gt, m_gt, 2, 2, 0, 0); + const auto lo_gt = And(m_eq, lo_in_hi); + + const auto gt = Or(lo_gt, m_gt); + // Copy result in upper 32 bits to lower 32 bits. 
+ return Mask256{wasm_i32x4_shuffle(gt, gt, 3, 3, 1, 1)}; +} + +template +HWY_API Mask256 operator>(Vec256 a, Vec256 b) { + const Full256 du; + const RebindToSigned di; + const Vec256 msb = Set(du, (LimitsMax() >> 1) + 1); + return RebindMask(du, BitCast(di, Xor(a, msb)) > BitCast(di, Xor(b, msb))); +} + +HWY_API Mask256 operator>(const Vec256 a, const Vec256 b) { + return Mask256{wasm_f32x4_gt(a.raw, b.raw)}; +} + +template +HWY_API Mask256 operator<(const Vec256 a, const Vec256 b) { + return operator>(b, a); +} + +// ------------------------------ Weak inequality + +// Float <= >= +HWY_API Mask256 operator<=(const Vec256 a, + const Vec256 b) { + return Mask256{wasm_f32x4_le(a.raw, b.raw)}; +} +HWY_API Mask256 operator>=(const Vec256 a, + const Vec256 b) { + return Mask256{wasm_f32x4_ge(a.raw, b.raw)}; +} + +// ------------------------------ FirstN (Iota, Lt) + +template +HWY_API Mask256 FirstN(const Full256 d, size_t num) { + const RebindToSigned di; // Signed comparisons may be cheaper. + return RebindMask(d, Iota(di, 0) < Set(di, static_cast>(num))); +} + +// ================================================== LOGICAL + +// ------------------------------ Not + +template +HWY_API Vec256 Not(Vec256 v) { + return Vec256{wasm_v128_not(v.raw)}; +} + +// ------------------------------ And + +template +HWY_API Vec256 And(Vec256 a, Vec256 b) { + return Vec256{wasm_v128_and(a.raw, b.raw)}; +} + +// ------------------------------ AndNot + +// Returns ~not_mask & mask. +template +HWY_API Vec256 AndNot(Vec256 not_mask, Vec256 mask) { + return Vec256{wasm_v128_andnot(mask.raw, not_mask.raw)}; +} + +// ------------------------------ Or + +template +HWY_API Vec256 Or(Vec256 a, Vec256 b) { + return Vec256{wasm_v128_or(a.raw, b.raw)}; +} + +// ------------------------------ Xor + +template +HWY_API Vec256 Xor(Vec256 a, Vec256 b) { + return Vec256{wasm_v128_xor(a.raw, b.raw)}; +} + +// ------------------------------ Or3 + +template +HWY_API Vec256 Or3(Vec256 o1, Vec256 o2, Vec256 o3) { + return Or(o1, Or(o2, o3)); +} + +// ------------------------------ OrAnd + +template +HWY_API Vec256 OrAnd(Vec256 o, Vec256 a1, Vec256 a2) { + return Or(o, And(a1, a2)); +} + +// ------------------------------ IfVecThenElse + +template +HWY_API Vec256 IfVecThenElse(Vec256 mask, Vec256 yes, Vec256 no) { + return IfThenElse(MaskFromVec(mask), yes, no); +} + +// ------------------------------ Operator overloads (internal-only if float) + +template +HWY_API Vec256 operator&(const Vec256 a, const Vec256 b) { + return And(a, b); +} + +template +HWY_API Vec256 operator|(const Vec256 a, const Vec256 b) { + return Or(a, b); +} + +template +HWY_API Vec256 operator^(const Vec256 a, const Vec256 b) { + return Xor(a, b); +} + +// ------------------------------ CopySign + +template +HWY_API Vec256 CopySign(const Vec256 magn, const Vec256 sign) { + static_assert(IsFloat(), "Only makes sense for floating-point"); + const auto msb = SignBit(Full256()); + return Or(AndNot(msb, magn), And(msb, sign)); +} + +template +HWY_API Vec256 CopySignToAbs(const Vec256 abs, const Vec256 sign) { + static_assert(IsFloat(), "Only makes sense for floating-point"); + return Or(abs, And(SignBit(Full256()), sign)); +} + +// ------------------------------ BroadcastSignBit (compare) + +template +HWY_API Vec256 BroadcastSignBit(const Vec256 v) { + return ShiftRight(v); +} +HWY_API Vec256 BroadcastSignBit(const Vec256 v) { + return VecFromMask(Full256(), v < Zero(Full256())); +} + +// ------------------------------ Mask + +// Mask and Vec are the same (true = 
FF..FF). +template +HWY_API Mask256 MaskFromVec(const Vec256 v) { + return Mask256{v.raw}; +} + +template +HWY_API Vec256 VecFromMask(Full256 /* tag */, Mask256 v) { + return Vec256{v.raw}; +} + +// mask ? yes : no +template +HWY_API Vec256 IfThenElse(Mask256 mask, Vec256 yes, Vec256 no) { + return Vec256{wasm_v128_bitselect(yes.raw, no.raw, mask.raw)}; +} + +// mask ? yes : 0 +template +HWY_API Vec256 IfThenElseZero(Mask256 mask, Vec256 yes) { + return yes & VecFromMask(Full256(), mask); +} + +// mask ? 0 : no +template +HWY_API Vec256 IfThenZeroElse(Mask256 mask, Vec256 no) { + return AndNot(VecFromMask(Full256(), mask), no); +} + +template + HWY_API Vec256 < + T IfNegativeThenElse(Vec256 v, Vec256 yes, Vec256 no) { + HWY_ASSERT(0); // Not implemented +} + +template +HWY_API Vec256 ZeroIfNegative(Vec256 v) { + const Full256 d; + const auto zero = Zero(d); + return IfThenElse(Mask256{(v > zero).raw}, v, zero); +} + +// ------------------------------ Mask logical + +template +HWY_API Mask256 Not(const Mask256 m) { + return MaskFromVec(Not(VecFromMask(Full256(), m))); +} + +template +HWY_API Mask256 And(const Mask256 a, Mask256 b) { + const Full256 d; + return MaskFromVec(And(VecFromMask(d, a), VecFromMask(d, b))); +} + +template +HWY_API Mask256 AndNot(const Mask256 a, Mask256 b) { + const Full256 d; + return MaskFromVec(AndNot(VecFromMask(d, a), VecFromMask(d, b))); +} + +template +HWY_API Mask256 Or(const Mask256 a, Mask256 b) { + const Full256 d; + return MaskFromVec(Or(VecFromMask(d, a), VecFromMask(d, b))); +} + +template +HWY_API Mask256 Xor(const Mask256 a, Mask256 b) { + const Full256 d; + return MaskFromVec(Xor(VecFromMask(d, a), VecFromMask(d, b))); +} + +template +HWY_API Mask256 ExclusiveNeither(const Mask256 a, Mask256 b) { + const Full256 d; + return MaskFromVec(AndNot(VecFromMask(d, a), Not(VecFromMask(d, b)))); +} + +// ------------------------------ Shl (BroadcastSignBit, IfThenElse) + +// The x86 multiply-by-Pow2() trick will not work because WASM saturates +// float->int correctly to 2^31-1 (not 2^31). Because WASM's shifts take a +// scalar count operand, per-lane shift instructions would require extract_lane +// for each lane, and hoping that shuffle is correctly mapped to a native +// instruction. Using non-vector shifts would incur a store-load forwarding +// stall when loading the result vector. We instead test bits of the shift +// count to "predicate" a shift of the entire vector by a constant. + +template +HWY_API Vec256 operator<<(Vec256 v, const Vec256 bits) { + const Full256 d; + Mask256 mask; + // Need a signed type for BroadcastSignBit. + auto test = BitCast(RebindToSigned(), bits); + // Move the highest valid bit of the shift count into the sign bit. + test = ShiftLeft<12>(test); + + mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); + test = ShiftLeft<1>(test); // next bit (descending order) + v = IfThenElse(mask, ShiftLeft<8>(v), v); + + mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); + test = ShiftLeft<1>(test); // next bit (descending order) + v = IfThenElse(mask, ShiftLeft<4>(v), v); + + mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); + test = ShiftLeft<1>(test); // next bit (descending order) + v = IfThenElse(mask, ShiftLeft<2>(v), v); + + mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); + return IfThenElse(mask, ShiftLeft<1>(v), v); +} + +template +HWY_API Vec256 operator<<(Vec256 v, const Vec256 bits) { + const Full256 d; + Mask256 mask; + // Need a signed type for BroadcastSignBit. 
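+  // Technique: a valid 32-bit shift count is 0..31, i.e. 5 bits. ShiftLeft<27>
+  // below moves bit 4 into the sign bit, and each subsequent ShiftLeft<1>
+  // exposes the next lower bit, predicating shifts by 16, 8, 4, 2, 1.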
+ auto test = BitCast(RebindToSigned(), bits); + // Move the highest valid bit of the shift count into the sign bit. + test = ShiftLeft<27>(test); + + mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); + test = ShiftLeft<1>(test); // next bit (descending order) + v = IfThenElse(mask, ShiftLeft<16>(v), v); + + mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); + test = ShiftLeft<1>(test); // next bit (descending order) + v = IfThenElse(mask, ShiftLeft<8>(v), v); + + mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); + test = ShiftLeft<1>(test); // next bit (descending order) + v = IfThenElse(mask, ShiftLeft<4>(v), v); + + mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); + test = ShiftLeft<1>(test); // next bit (descending order) + v = IfThenElse(mask, ShiftLeft<2>(v), v); + + mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); + return IfThenElse(mask, ShiftLeft<1>(v), v); +} + +// ------------------------------ Shr (BroadcastSignBit, IfThenElse) + +template +HWY_API Vec256 operator>>(Vec256 v, const Vec256 bits) { + const Full256 d; + Mask256 mask; + // Need a signed type for BroadcastSignBit. + auto test = BitCast(RebindToSigned(), bits); + // Move the highest valid bit of the shift count into the sign bit. + test = ShiftLeft<12>(test); + + mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); + test = ShiftLeft<1>(test); // next bit (descending order) + v = IfThenElse(mask, ShiftRight<8>(v), v); + + mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); + test = ShiftLeft<1>(test); // next bit (descending order) + v = IfThenElse(mask, ShiftRight<4>(v), v); + + mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); + test = ShiftLeft<1>(test); // next bit (descending order) + v = IfThenElse(mask, ShiftRight<2>(v), v); + + mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); + return IfThenElse(mask, ShiftRight<1>(v), v); +} + +template +HWY_API Vec256 operator>>(Vec256 v, const Vec256 bits) { + const Full256 d; + Mask256 mask; + // Need a signed type for BroadcastSignBit. + auto test = BitCast(RebindToSigned(), bits); + // Move the highest valid bit of the shift count into the sign bit. + test = ShiftLeft<27>(test); + + mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); + test = ShiftLeft<1>(test); // next bit (descending order) + v = IfThenElse(mask, ShiftRight<16>(v), v); + + mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); + test = ShiftLeft<1>(test); // next bit (descending order) + v = IfThenElse(mask, ShiftRight<8>(v), v); + + mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); + test = ShiftLeft<1>(test); // next bit (descending order) + v = IfThenElse(mask, ShiftRight<4>(v), v); + + mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); + test = ShiftLeft<1>(test); // next bit (descending order) + v = IfThenElse(mask, ShiftRight<2>(v), v); + + mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); + return IfThenElse(mask, ShiftRight<1>(v), v); +} + +// ================================================== MEMORY + +// ------------------------------ Load + +template +HWY_API Vec256 Load(Full256 /* tag */, const T* HWY_RESTRICT aligned) { + return Vec256{wasm_v128_load(aligned)}; +} + +template +HWY_API Vec256 MaskedLoad(Mask256 m, Full256 d, + const T* HWY_RESTRICT aligned) { + return IfThenElseZero(m, Load(d, aligned)); +} + +// LoadU == Load. 
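+// (WASM's v128.load does not require alignment, so the aligned and unaligned
+// paths can share one implementation.)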
+template +HWY_API Vec256 LoadU(Full256 d, const T* HWY_RESTRICT p) { + return Load(d, p); +} + +// 128-bit SIMD => nothing to duplicate, same as an unaligned load. +template +HWY_API Vec256 LoadDup128(Full256 d, const T* HWY_RESTRICT p) { + return Load(d, p); +} + +// ------------------------------ Store + +template +HWY_API void Store(Vec256 v, Full256 /* tag */, T* HWY_RESTRICT aligned) { + wasm_v128_store(aligned, v.raw); +} + +// StoreU == Store. +template +HWY_API void StoreU(Vec256 v, Full256 d, T* HWY_RESTRICT p) { + Store(v, d, p); +} + +template +HWY_API void BlendedStore(Vec256 v, Mask256 m, Full256 d, + T* HWY_RESTRICT p) { + StoreU(IfThenElse(m, v, LoadU(d, p)), d, p); +} + +// ------------------------------ Non-temporal stores + +// Same as aligned stores on non-x86. + +template +HWY_API void Stream(Vec256 v, Full256 /* tag */, + T* HWY_RESTRICT aligned) { + wasm_v128_store(aligned, v.raw); +} + +// ------------------------------ Scatter (Store) + +template +HWY_API void ScatterOffset(Vec256 v, Full256 d, T* HWY_RESTRICT base, + const Vec256 offset) { + static_assert(sizeof(T) == sizeof(Offset), "Must match for portability"); + + alignas(32) T lanes[32 / sizeof(T)]; + Store(v, d, lanes); + + alignas(32) Offset offset_lanes[32 / sizeof(T)]; + Store(offset, Full256(), offset_lanes); + + uint8_t* base_bytes = reinterpret_cast(base); + for (size_t i = 0; i < N; ++i) { + CopyBytes(&lanes[i], base_bytes + offset_lanes[i]); + } +} + +template +HWY_API void ScatterIndex(Vec256 v, Full256 d, T* HWY_RESTRICT base, + const Vec256 index) { + static_assert(sizeof(T) == sizeof(Index), "Must match for portability"); + + alignas(32) T lanes[32 / sizeof(T)]; + Store(v, d, lanes); + + alignas(32) Index index_lanes[32 / sizeof(T)]; + Store(index, Full256(), index_lanes); + + for (size_t i = 0; i < N; ++i) { + base[index_lanes[i]] = lanes[i]; + } +} + +// ------------------------------ Gather (Load/Store) + +template +HWY_API Vec256 GatherOffset(const Full256 d, const T* HWY_RESTRICT base, + const Vec256 offset) { + static_assert(sizeof(T) == sizeof(Offset), "Must match for portability"); + + alignas(32) Offset offset_lanes[32 / sizeof(T)]; + Store(offset, Full256(), offset_lanes); + + alignas(32) T lanes[32 / sizeof(T)]; + const uint8_t* base_bytes = reinterpret_cast(base); + for (size_t i = 0; i < N; ++i) { + CopyBytes(base_bytes + offset_lanes[i], &lanes[i]); + } + return Load(d, lanes); +} + +template +HWY_API Vec256 GatherIndex(const Full256 d, const T* HWY_RESTRICT base, + const Vec256 index) { + static_assert(sizeof(T) == sizeof(Index), "Must match for portability"); + + alignas(32) Index index_lanes[32 / sizeof(T)]; + Store(index, Full256(), index_lanes); + + alignas(32) T lanes[32 / sizeof(T)]; + for (size_t i = 0; i < N; ++i) { + lanes[i] = base[index_lanes[i]]; + } + return Load(d, lanes); +} + +// ================================================== SWIZZLE + +// ------------------------------ ExtractLane +template +HWY_API T ExtractLane(const Vec128 v, size_t i) { + HWY_ASSERT(0); // Not implemented +} + +// ------------------------------ InsertLane +template +HWY_API Vec128 InsertLane(const Vec128 v, size_t i, T t) { + HWY_ASSERT(0); // Not implemented +} + +// ------------------------------ GetLane +// Gets the single value stored in a vector/part. 
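+// E.g. for an int32 vector whose lane 0 holds 5, GetLane returns 5; the
+// overloads below always extract lane 0.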
+HWY_API uint8_t GetLane(const Vec256 v) { + return wasm_i8x16_extract_lane(v.raw, 0); +} +HWY_API int8_t GetLane(const Vec256 v) { + return wasm_i8x16_extract_lane(v.raw, 0); +} +HWY_API uint16_t GetLane(const Vec256 v) { + return wasm_i16x8_extract_lane(v.raw, 0); +} +HWY_API int16_t GetLane(const Vec256 v) { + return wasm_i16x8_extract_lane(v.raw, 0); +} +HWY_API uint32_t GetLane(const Vec256 v) { + return wasm_i32x4_extract_lane(v.raw, 0); +} +HWY_API int32_t GetLane(const Vec256 v) { + return wasm_i32x4_extract_lane(v.raw, 0); +} +HWY_API uint64_t GetLane(const Vec256 v) { + return wasm_i64x2_extract_lane(v.raw, 0); +} +HWY_API int64_t GetLane(const Vec256 v) { + return wasm_i64x2_extract_lane(v.raw, 0); +} + +HWY_API float GetLane(const Vec256 v) { + return wasm_f32x4_extract_lane(v.raw, 0); +} + +// ------------------------------ LowerHalf + +template +HWY_API Vec128 LowerHalf(Full128 /* tag */, Vec256 v) { + return Vec128{v.raw}; +} + +template +HWY_API Vec128 LowerHalf(Vec256 v) { + return LowerHalf(Full128(), v); +} + +// ------------------------------ ShiftLeftBytes + +// 0x01..0F, kBytes = 1 => 0x02..0F00 +template +HWY_API Vec256 ShiftLeftBytes(Full256 /* tag */, Vec256 v) { + static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes"); + const __i8x16 zero = wasm_i8x16_splat(0); + switch (kBytes) { + case 0: + return v; + + case 1: + return Vec256{wasm_i8x16_shuffle(v.raw, zero, 16, 0, 1, 2, 3, 4, 5, 6, + 7, 8, 9, 10, 11, 12, 13, 14)}; + + case 2: + return Vec256{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 0, 1, 2, 3, 4, 5, + 6, 7, 8, 9, 10, 11, 12, 13)}; + + case 3: + return Vec256{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 0, 1, 2, 3, + 4, 5, 6, 7, 8, 9, 10, 11, 12)}; + + case 4: + return Vec256{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 0, 1, 2, + 3, 4, 5, 6, 7, 8, 9, 10, 11)}; + + case 5: + return Vec256{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 0, 1, + 2, 3, 4, 5, 6, 7, 8, 9, 10)}; + + case 6: + return Vec256{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9)}; + + case 7: + return Vec256{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16, + 16, 0, 1, 2, 3, 4, 5, 6, 7, 8)}; + + case 8: + return Vec256{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16, + 16, 16, 0, 1, 2, 3, 4, 5, 6, 7)}; + + case 9: + return Vec256{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 0, 1, 2, 3, 4, 5, 6)}; + + case 10: + return Vec256{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 0, 1, 2, 3, 4, 5)}; + + case 11: + return Vec256{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 0, 1, 2, 3, 4)}; + + case 12: + return Vec256{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 0, 1, 2, 3)}; + + case 13: + return Vec256{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 0, 1, 2)}; + + case 14: + return Vec256{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 0, + 1)}; + + case 15: + return Vec256{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, + 0)}; + } + return Vec256{zero}; +} + +template +HWY_API Vec256 ShiftLeftBytes(Vec256 v) { + return ShiftLeftBytes(Full256(), v); +} + +// ------------------------------ ShiftLeftLanes + +template +HWY_API Vec256 ShiftLeftLanes(Full256 d, const Vec256 v) { + const Repartition d8; + return BitCast(d, ShiftLeftBytes(BitCast(d8, v))); +} + +template +HWY_API Vec256 
ShiftLeftLanes(const Vec256 v) { + return ShiftLeftLanes(Full256(), v); +} + +// ------------------------------ ShiftRightBytes +namespace detail { + +// Helper function allows zeroing invalid lanes in caller. +template +HWY_API __i8x16 ShrBytes(const Vec256 v) { + static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes"); + const __i8x16 zero = wasm_i8x16_splat(0); + + switch (kBytes) { + case 0: + return v.raw; + + case 1: + return wasm_i8x16_shuffle(v.raw, zero, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, + 12, 13, 14, 15, 16); + + case 2: + return wasm_i8x16_shuffle(v.raw, zero, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, + 13, 14, 15, 16, 16); + + case 3: + return wasm_i8x16_shuffle(v.raw, zero, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, + 13, 14, 15, 16, 16, 16); + + case 4: + return wasm_i8x16_shuffle(v.raw, zero, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, + 14, 15, 16, 16, 16, 16); + + case 5: + return wasm_i8x16_shuffle(v.raw, zero, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 15, 16, 16, 16, 16, 16); + + case 6: + return wasm_i8x16_shuffle(v.raw, zero, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 16, 16, 16, 16, 16); + + case 7: + return wasm_i8x16_shuffle(v.raw, zero, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 16, 16, 16, 16, 16, 16); + + case 8: + return wasm_i8x16_shuffle(v.raw, zero, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 16, 16, 16, 16, 16, 16, 16); + + case 9: + return wasm_i8x16_shuffle(v.raw, zero, 9, 10, 11, 12, 13, 14, 15, 16, 16, + 16, 16, 16, 16, 16, 16, 16); + + case 10: + return wasm_i8x16_shuffle(v.raw, zero, 10, 11, 12, 13, 14, 15, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16); + + case 11: + return wasm_i8x16_shuffle(v.raw, zero, 11, 12, 13, 14, 15, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16); + + case 12: + return wasm_i8x16_shuffle(v.raw, zero, 12, 13, 14, 15, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16); + + case 13: + return wasm_i8x16_shuffle(v.raw, zero, 13, 14, 15, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16); + + case 14: + return wasm_i8x16_shuffle(v.raw, zero, 14, 15, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16); + + case 15: + return wasm_i8x16_shuffle(v.raw, zero, 15, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16); + case 16: + return zero; + } +} + +} // namespace detail + +// 0x01..0F, kBytes = 1 => 0x0001..0E +template +HWY_API Vec256 ShiftRightBytes(Full256 /* tag */, Vec256 v) { + return Vec256{detail::ShrBytes(v)}; +} + +// ------------------------------ ShiftRightLanes +template +HWY_API Vec256 ShiftRightLanes(Full256 d, const Vec256 v) { + const Repartition d8; + return BitCast(d, ShiftRightBytes(BitCast(d8, v))); +} + +// ------------------------------ UpperHalf (ShiftRightBytes) + +// Full input: copy hi into lo (smaller instruction encoding than shifts). 
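+// E.g. with u32 lanes 3,2,1,0 (MSB first), the 2,3,2,3 shuffle below yields
+// 3,2,3,2; the returned Vec128 is its low half, i.e. original lanes 3,2.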
+template +HWY_API Vec128 UpperHalf(Full128 /* tag */, + const Vec256 v) { + return Vec128{wasm_i32x4_shuffle(v.raw, v.raw, 2, 3, 2, 3)}; +} +HWY_API Vec128 UpperHalf(Full128 /* tag */, + const Vec128 v) { + return Vec128{wasm_i32x4_shuffle(v.raw, v.raw, 2, 3, 2, 3)}; +} + +// ------------------------------ CombineShiftRightBytes + +template > +HWY_API V CombineShiftRightBytes(Full256 /* tag */, V hi, V lo) { + static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes"); + switch (kBytes) { + case 0: + return lo; + + case 1: + return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, + 11, 12, 13, 14, 15, 16)}; + + case 2: + return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 2, 3, 4, 5, 6, 7, 8, 9, 10, + 11, 12, 13, 14, 15, 16, 17)}; + + case 3: + return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 3, 4, 5, 6, 7, 8, 9, 10, 11, + 12, 13, 14, 15, 16, 17, 18)}; + + case 4: + return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 4, 5, 6, 7, 8, 9, 10, 11, 12, + 13, 14, 15, 16, 17, 18, 19)}; + + case 5: + return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 5, 6, 7, 8, 9, 10, 11, 12, 13, + 14, 15, 16, 17, 18, 19, 20)}; + + case 6: + return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 6, 7, 8, 9, 10, 11, 12, 13, + 14, 15, 16, 17, 18, 19, 20, 21)}; + + case 7: + return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 7, 8, 9, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21, 22)}; + + case 8: + return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23)}; + + case 9: + return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24)}; + + case 10: + return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25)}; + + case 11: + return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 11, 12, 13, 14, 15, 16, 17, + 18, 19, 20, 21, 22, 23, 24, 25, 26)}; + + case 12: + return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 12, 13, 14, 15, 16, 17, 18, + 19, 20, 21, 22, 23, 24, 25, 26, 27)}; + + case 13: + return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 13, 14, 15, 16, 17, 18, 19, + 20, 21, 22, 23, 24, 25, 26, 27, 28)}; + + case 14: + return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 14, 15, 16, 17, 18, 19, 20, + 21, 22, 23, 24, 25, 26, 27, 28, 29)}; + + case 15: + return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 15, 16, 17, 18, 19, 20, 21, + 22, 23, 24, 25, 26, 27, 28, 29, 30)}; + } + return hi; +} + +// ------------------------------ Broadcast/splat any lane + +// Unsigned +template +HWY_API Vec256 Broadcast(const Vec256 v) { + static_assert(0 <= kLane && kLane < N, "Invalid lane"); + return Vec256{wasm_i16x8_shuffle( + v.raw, v.raw, kLane, kLane, kLane, kLane, kLane, kLane, kLane, kLane)}; +} +template +HWY_API Vec256 Broadcast(const Vec256 v) { + static_assert(0 <= kLane && kLane < N, "Invalid lane"); + return Vec256{ + wasm_i32x4_shuffle(v.raw, v.raw, kLane, kLane, kLane, kLane)}; +} + +// Signed +template +HWY_API Vec256 Broadcast(const Vec256 v) { + static_assert(0 <= kLane && kLane < N, "Invalid lane"); + return Vec256{wasm_i16x8_shuffle(v.raw, v.raw, kLane, kLane, kLane, + kLane, kLane, kLane, kLane, kLane)}; +} +template +HWY_API Vec256 Broadcast(const Vec256 v) { + static_assert(0 <= kLane && kLane < N, "Invalid lane"); + return Vec256{ + wasm_i32x4_shuffle(v.raw, v.raw, kLane, kLane, kLane, kLane)}; +} + +// Float +template +HWY_API Vec256 Broadcast(const Vec256 v) { + static_assert(0 <= kLane && kLane < N, "Invalid lane"); + return Vec256{ + wasm_i32x4_shuffle(v.raw, v.raw, kLane, kLane, kLane, kLane)}; +} + +// 
------------------------------ TableLookupBytes + +// Returns vector of bytes[from[i]]. "from" is also interpreted as bytes, i.e. +// lane indices in [0, 16). +template +HWY_API Vec256 TableLookupBytes(const Vec256 bytes, + const Vec256 from) { +// Not yet available in all engines, see +// https://github.com/WebAssembly/simd/blob/bdcc304b2d379f4601c2c44ea9b44ed9484fde7e/proposals/simd/ImplementationStatus.md +// V8 implementation of this had a bug, fixed on 2021-04-03: +// https://chromium-review.googlesource.com/c/v8/v8/+/2822951 +#if 0 + return Vec256{wasm_i8x16_swizzle(bytes.raw, from.raw)}; +#else + alignas(32) uint8_t control[16]; + alignas(32) uint8_t input[16]; + alignas(32) uint8_t output[16]; + wasm_v128_store(control, from.raw); + wasm_v128_store(input, bytes.raw); + for (size_t i = 0; i < 16; ++i) { + output[i] = control[i] < 16 ? input[control[i]] : 0; + } + return Vec256{wasm_v128_load(output)}; +#endif +} + +template +HWY_API Vec256 TableLookupBytesOr0(const Vec256 bytes, + const Vec256 from) { + const Full256 d; + // Mask size must match vector type, so cast everything to this type. + Repartition di8; + Repartition> d_bytes8; + const auto msb = BitCast(di8, from) < Zero(di8); + const auto lookup = + TableLookupBytes(BitCast(d_bytes8, bytes), BitCast(di8, from)); + return BitCast(d, IfThenZeroElse(msb, lookup)); +} + +// ------------------------------ Hard-coded shuffles + +// Notation: let Vec128 have lanes 3,2,1,0 (0 is least-significant). +// Shuffle0321 rotates one lane to the right (the previous least-significant +// lane is now most-significant). These could also be implemented via +// CombineShiftRightBytes but the shuffle_abcd notation is more convenient. + +// Swap 32-bit halves in 64-bit halves. +HWY_API Vec128 Shuffle2301(const Vec128 v) { + return Vec128{wasm_i32x4_shuffle(v.raw, v.raw, 1, 0, 3, 2)}; +} +HWY_API Vec128 Shuffle2301(const Vec128 v) { + return Vec128{wasm_i32x4_shuffle(v.raw, v.raw, 1, 0, 3, 2)}; +} +HWY_API Vec128 Shuffle2301(const Vec128 v) { + return Vec128{wasm_i32x4_shuffle(v.raw, v.raw, 1, 0, 3, 2)}; +} + +// Swap 64-bit halves +HWY_API Vec128 Shuffle1032(const Vec128 v) { + return Vec128{wasm_i64x2_shuffle(v.raw, v.raw, 1, 0)}; +} +HWY_API Vec128 Shuffle1032(const Vec128 v) { + return Vec128{wasm_i64x2_shuffle(v.raw, v.raw, 1, 0)}; +} +HWY_API Vec128 Shuffle1032(const Vec128 v) { + return Vec128{wasm_i64x2_shuffle(v.raw, v.raw, 1, 0)}; +} + +// Rotate right 32 bits +HWY_API Vec128 Shuffle0321(const Vec128 v) { + return Vec128{wasm_i32x4_shuffle(v.raw, v.raw, 1, 2, 3, 0)}; +} +HWY_API Vec128 Shuffle0321(const Vec128 v) { + return Vec128{wasm_i32x4_shuffle(v.raw, v.raw, 1, 2, 3, 0)}; +} +HWY_API Vec128 Shuffle0321(const Vec128 v) { + return Vec128{wasm_i32x4_shuffle(v.raw, v.raw, 1, 2, 3, 0)}; +} +// Rotate left 32 bits +HWY_API Vec128 Shuffle2103(const Vec128 v) { + return Vec128{wasm_i32x4_shuffle(v.raw, v.raw, 3, 0, 1, 2)}; +} +HWY_API Vec128 Shuffle2103(const Vec128 v) { + return Vec128{wasm_i32x4_shuffle(v.raw, v.raw, 3, 0, 1, 2)}; +} +HWY_API Vec128 Shuffle2103(const Vec128 v) { + return Vec128{wasm_i32x4_shuffle(v.raw, v.raw, 3, 0, 1, 2)}; +} + +// Reverse +HWY_API Vec128 Shuffle0123(const Vec128 v) { + return Vec128{wasm_i32x4_shuffle(v.raw, v.raw, 3, 2, 1, 0)}; +} +HWY_API Vec128 Shuffle0123(const Vec128 v) { + return Vec128{wasm_i32x4_shuffle(v.raw, v.raw, 3, 2, 1, 0)}; +} +HWY_API Vec128 Shuffle0123(const Vec128 v) { + return Vec128{wasm_i32x4_shuffle(v.raw, v.raw, 3, 2, 1, 0)}; +} + +// ------------------------------ TableLookupLanes 
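+// (Per-lane gather from a single vector: out[i] = v[indices[i]]. E.g. u32
+// indices {3,2,1,0} reverse the four lanes.)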
+ +// Returned by SetTableIndices for use by TableLookupLanes. +template +struct Indices256 { + __v128_u raw; +}; + +template +HWY_API Indices256 IndicesFromVec(Full256 d, Vec256 vec) { + static_assert(sizeof(T) == sizeof(TI), "Index size must match lane"); + return Indices256{}; +} + +template +HWY_API Indices256 SetTableIndices(Full256 d, const TI* idx) { + const Rebind di; + return IndicesFromVec(d, LoadU(di, idx)); +} + +template +HWY_API Vec256 TableLookupLanes(Vec256 v, Indices256 idx) { + using TI = MakeSigned; + const Full256 d; + const Full256 di; + return BitCast(d, TableLookupBytes(BitCast(di, v), Vec256{idx.raw})); +} + +// ------------------------------ Reverse (Shuffle0123, Shuffle2301, Shuffle01) + +template +HWY_API Vec256 Reverse(Full256 /* tag */, const Vec256 v) { + return Shuffle01(v); +} + +// Four lanes: shuffle +template +HWY_API Vec256 Reverse(Full256 /* tag */, const Vec256 v) { + return Shuffle0123(v); +} + +// 16-bit +template +HWY_API Vec256 Reverse(Full256 d, const Vec256 v) { + const RepartitionToWide> du32; + return BitCast(d, RotateRight<16>(Reverse(du32, BitCast(du32, v)))); +} + +// ------------------------------ Reverse2 + +template +HWY_API Vec256 Reverse2(Full256 d, const Vec256 v) { + HWY_ASSERT(0); // Not implemented +} + +// ------------------------------ Reverse4 + +template +HWY_API Vec256 Reverse4(Full256 d, const Vec256 v) { + HWY_ASSERT(0); // Not implemented +} + +// ------------------------------ Reverse8 + +template +HWY_API Vec256 Reverse8(Full256 d, const Vec256 v) { + HWY_ASSERT(0); // Not implemented +} + +// ------------------------------ InterleaveLower + +HWY_API Vec256 InterleaveLower(Vec256 a, Vec256 b) { + return Vec256{wasm_i8x16_shuffle(a.raw, b.raw, 0, 16, 1, 17, 2, 18, + 3, 19, 4, 20, 5, 21, 6, 22, 7, 23)}; +} +HWY_API Vec256 InterleaveLower(Vec256 a, + Vec256 b) { + return Vec256{ + wasm_i16x8_shuffle(a.raw, b.raw, 0, 8, 1, 9, 2, 10, 3, 11)}; +} +HWY_API Vec256 InterleaveLower(Vec256 a, + Vec256 b) { + return Vec256{wasm_i32x4_shuffle(a.raw, b.raw, 0, 4, 1, 5)}; +} +HWY_API Vec256 InterleaveLower(Vec256 a, + Vec256 b) { + return Vec256{wasm_i64x2_shuffle(a.raw, b.raw, 0, 2)}; +} + +HWY_API Vec256 InterleaveLower(Vec256 a, Vec256 b) { + return Vec256{wasm_i8x16_shuffle(a.raw, b.raw, 0, 16, 1, 17, 2, 18, 3, + 19, 4, 20, 5, 21, 6, 22, 7, 23)}; +} +HWY_API Vec256 InterleaveLower(Vec256 a, Vec256 b) { + return Vec256{ + wasm_i16x8_shuffle(a.raw, b.raw, 0, 8, 1, 9, 2, 10, 3, 11)}; +} +HWY_API Vec256 InterleaveLower(Vec256 a, Vec256 b) { + return Vec256{wasm_i32x4_shuffle(a.raw, b.raw, 0, 4, 1, 5)}; +} +HWY_API Vec256 InterleaveLower(Vec256 a, Vec256 b) { + return Vec256{wasm_i64x2_shuffle(a.raw, b.raw, 0, 2)}; +} + +HWY_API Vec256 InterleaveLower(Vec256 a, Vec256 b) { + return Vec256{wasm_i32x4_shuffle(a.raw, b.raw, 0, 4, 1, 5)}; +} + +// Additional overload for the optional tag. +template > +HWY_API V InterleaveLower(Full256 /* tag */, V a, V b) { + return InterleaveLower(a, b); +} + +// ------------------------------ InterleaveUpper (UpperHalf) + +// All functions inside detail lack the required D parameter. 
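+// E.g. for u32 lanes a = 3,2,1,0 and b = 7,6,5,4 (MSB first), InterleaveUpper
+// returns 7,3,6,2: the upper halves of a and b, interleaved.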
+namespace detail { + +HWY_API Vec256 InterleaveUpper(Vec256 a, Vec256 b) { + return Vec256{wasm_i8x16_shuffle(a.raw, b.raw, 8, 24, 9, 25, 10, 26, + 11, 27, 12, 28, 13, 29, 14, 30, 15, + 31)}; +} +HWY_API Vec256 InterleaveUpper(Vec256 a, + Vec256 b) { + return Vec256{ + wasm_i16x8_shuffle(a.raw, b.raw, 4, 12, 5, 13, 6, 14, 7, 15)}; +} +HWY_API Vec256 InterleaveUpper(Vec256 a, + Vec256 b) { + return Vec256{wasm_i32x4_shuffle(a.raw, b.raw, 2, 6, 3, 7)}; +} +HWY_API Vec256 InterleaveUpper(Vec256 a, + Vec256 b) { + return Vec256{wasm_i64x2_shuffle(a.raw, b.raw, 1, 3)}; +} + +HWY_API Vec256 InterleaveUpper(Vec256 a, Vec256 b) { + return Vec256{wasm_i8x16_shuffle(a.raw, b.raw, 8, 24, 9, 25, 10, 26, + 11, 27, 12, 28, 13, 29, 14, 30, 15, + 31)}; +} +HWY_API Vec256 InterleaveUpper(Vec256 a, Vec256 b) { + return Vec256{ + wasm_i16x8_shuffle(a.raw, b.raw, 4, 12, 5, 13, 6, 14, 7, 15)}; +} +HWY_API Vec256 InterleaveUpper(Vec256 a, Vec256 b) { + return Vec256{wasm_i32x4_shuffle(a.raw, b.raw, 2, 6, 3, 7)}; +} +HWY_API Vec256 InterleaveUpper(Vec256 a, Vec256 b) { + return Vec256{wasm_i64x2_shuffle(a.raw, b.raw, 1, 3)}; +} + +HWY_API Vec256 InterleaveUpper(Vec256 a, Vec256 b) { + return Vec256{wasm_i32x4_shuffle(a.raw, b.raw, 2, 6, 3, 7)}; +} + +} // namespace detail + +template > +HWY_API V InterleaveUpper(Full256 /* tag */, V a, V b) { + return detail::InterleaveUpper(a, b); +} + +// ------------------------------ ZipLower/ZipUpper (InterleaveLower) + +// Same as Interleave*, except that the return lanes are double-width integers; +// this is necessary because the single-lane scalar cannot return two values. +template >> +HWY_API VFromD ZipLower(Vec256 a, Vec256 b) { + return BitCast(DW(), InterleaveLower(a, b)); +} +template , class DW = RepartitionToWide> +HWY_API VFromD ZipLower(DW dw, Vec256 a, Vec256 b) { + return BitCast(dw, InterleaveLower(D(), a, b)); +} + +template , class DW = RepartitionToWide> +HWY_API VFromD ZipUpper(DW dw, Vec256 a, Vec256 b) { + return BitCast(dw, InterleaveUpper(D(), a, b)); +} + +// ================================================== COMBINE + +// ------------------------------ Combine (InterleaveLower) + +// N = N/2 + N/2 (upper half undefined) +template +HWY_API Vec256 Combine(Full256 d, Vec128 hi_half, Vec128 lo_half) { + const Half d2; + const RebindToUnsigned du2; + // Treat half-width input as one lane, and expand to two lanes. 
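+  // (Viewed as 64-bit lanes, InterleaveLower(lo, hi) puts lo_half in lane 0
+  // and hi_half in lane 1, i.e. the lower and upper halves of the result.)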
+ using VU = Vec128, 2>; + const VU lo{BitCast(du2, lo_half).raw}; + const VU hi{BitCast(du2, hi_half).raw}; + return BitCast(d, InterleaveLower(lo, hi)); +} + +// ------------------------------ ZeroExtendVector (Combine, IfThenElseZero) + +template +HWY_API Vec256 ZeroExtendVector(Full256 d, Vec128 lo) { + return IfThenElseZero(FirstN(d, 16 / sizeof(T)), Vec256{lo.raw}); +} + +// ------------------------------ ConcatLowerLower + +// hiH,hiL loH,loL |-> hiL,loL (= lower halves) +template +HWY_API Vec256 ConcatLowerLower(Full256 /* tag */, const Vec256 hi, + const Vec256 lo) { + return Vec256{wasm_i64x2_shuffle(lo.raw, hi.raw, 0, 2)}; +} + +// ------------------------------ ConcatUpperUpper + +template +HWY_API Vec256 ConcatUpperUpper(Full256 /* tag */, const Vec256 hi, + const Vec256 lo) { + return Vec256{wasm_i64x2_shuffle(lo.raw, hi.raw, 1, 3)}; +} + +// ------------------------------ ConcatLowerUpper + +template +HWY_API Vec256 ConcatLowerUpper(Full256 d, const Vec256 hi, + const Vec256 lo) { + return CombineShiftRightBytes<8>(d, hi, lo); +} + +// ------------------------------ ConcatUpperLower +template +HWY_API Vec256 ConcatUpperLower(Full256 d, const Vec256 hi, + const Vec256 lo) { + return IfThenElse(FirstN(d, Lanes(d) / 2), lo, hi); +} + +// ------------------------------ ConcatOdd + +// 32-bit +template +HWY_API Vec256 ConcatOdd(Full256 /* tag */, Vec256 hi, Vec256 lo) { + return Vec256{wasm_i32x4_shuffle(lo.raw, hi.raw, 1, 3, 5, 7)}; +} + +// 64-bit full - no partial because we need at least two inputs to have +// even/odd. +template +HWY_API Vec256 ConcatOdd(Full256 /* tag */, Vec256 hi, Vec256 lo) { + return InterleaveUpper(Full256(), lo, hi); +} + +// ------------------------------ ConcatEven (InterleaveLower) + +// 32-bit full +template +HWY_API Vec256 ConcatEven(Full256 /* tag */, Vec256 hi, Vec256 lo) { + return Vec256{wasm_i32x4_shuffle(lo.raw, hi.raw, 0, 2, 4, 6)}; +} + +// 64-bit full - no partial because we need at least two inputs to have +// even/odd. 
+template +HWY_API Vec256 ConcatEven(Full256 /* tag */, Vec256 hi, Vec256 lo) { + return InterleaveLower(Full256(), lo, hi); +} + +// ------------------------------ DupEven +template +HWY_API Vec256 DupEven(Vec256 v) { + HWY_ASSERT(0); // Not implemented +} + +// ------------------------------ DupOdd +template +HWY_API Vec256 DupOdd(Vec256 v) { + HWY_ASSERT(0); // Not implemented +} + +// ------------------------------ OddEven + +namespace detail { + +template +HWY_INLINE Vec256 OddEven(hwy::SizeTag<1> /* tag */, const Vec256 a, + const Vec256 b) { + const Full256 d; + const Repartition d8; + alignas(32) constexpr uint8_t mask[16] = {0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, + 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0}; + return IfThenElse(MaskFromVec(BitCast(d, Load(d8, mask))), b, a); +} +template +HWY_INLINE Vec256 OddEven(hwy::SizeTag<2> /* tag */, const Vec256 a, + const Vec256 b) { + return Vec256{wasm_i16x8_shuffle(a.raw, b.raw, 8, 1, 10, 3, 12, 5, 14, 7)}; +} +template +HWY_INLINE Vec256 OddEven(hwy::SizeTag<4> /* tag */, const Vec256 a, + const Vec256 b) { + return Vec256{wasm_i32x4_shuffle(a.raw, b.raw, 4, 1, 6, 3)}; +} +template +HWY_INLINE Vec256 OddEven(hwy::SizeTag<8> /* tag */, const Vec256 a, + const Vec256 b) { + return Vec256{wasm_i64x2_shuffle(a.raw, b.raw, 2, 1)}; +} + +} // namespace detail + +template +HWY_API Vec256 OddEven(const Vec256 a, const Vec256 b) { + return detail::OddEven(hwy::SizeTag(), a, b); +} +HWY_API Vec256 OddEven(const Vec256 a, const Vec256 b) { + return Vec256{wasm_i32x4_shuffle(a.raw, b.raw, 4, 1, 6, 3)}; +} + +// ------------------------------ OddEvenBlocks +template +HWY_API Vec256 OddEvenBlocks(Vec256 /* odd */, Vec256 even) { + return even; +} + +// ------------------------------ SwapAdjacentBlocks + +template +HWY_API Vec256 SwapAdjacentBlocks(Vec256 v) { + return v; +} + +// ------------------------------ ReverseBlocks + +template +HWY_API Vec256 ReverseBlocks(Full256 /* tag */, const Vec256 v) { + return v; +} + +// ================================================== CONVERT + +// ------------------------------ Promotions (part w/ narrow lanes -> full) + +// Unsigned: zero-extend. +HWY_API Vec256 PromoteTo(Full256 /* tag */, + const Vec128 v) { + return Vec256{wasm_u16x8_extend_low_u8x16(v.raw)}; +} +HWY_API Vec256 PromoteTo(Full256 /* tag */, + const Vec128 v) { + return Vec256{ + wasm_u32x4_extend_low_u16x8(wasm_u16x8_extend_low_u8x16(v.raw))}; +} +HWY_API Vec256 PromoteTo(Full256 /* tag */, + const Vec128 v) { + return Vec256{wasm_u16x8_extend_low_u8x16(v.raw)}; +} +HWY_API Vec256 PromoteTo(Full256 /* tag */, + const Vec128 v) { + return Vec256{ + wasm_u32x4_extend_low_u16x8(wasm_u16x8_extend_low_u8x16(v.raw))}; +} +HWY_API Vec256 PromoteTo(Full256 /* tag */, + const Vec128 v) { + return Vec256{wasm_u32x4_extend_low_u16x8(v.raw)}; +} +HWY_API Vec256 PromoteTo(Full256 /* tag */, + const Vec128 v) { + return Vec256{wasm_u32x4_extend_low_u16x8(v.raw)}; +} + +// Signed: replicate sign bit. 
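+// E.g. promoting int8_t -1 (0xFF) yields int16_t -1 (0xFFFF).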
+HWY_API Vec256 PromoteTo(Full256 /* tag */, + const Vec128 v) { + return Vec256{wasm_i16x8_extend_low_i8x16(v.raw)}; +} +HWY_API Vec256 PromoteTo(Full256 /* tag */, + const Vec128 v) { + return Vec256{ + wasm_i32x4_extend_low_i16x8(wasm_i16x8_extend_low_i8x16(v.raw))}; +} +HWY_API Vec256 PromoteTo(Full256 /* tag */, + const Vec128 v) { + return Vec256{wasm_i32x4_extend_low_i16x8(v.raw)}; +} + +HWY_API Vec256 PromoteTo(Full256 /* tag */, + const Vec128 v) { + return Vec256{wasm_f64x2_convert_low_i32x4(v.raw)}; +} + +HWY_API Vec256 PromoteTo(Full256 /* tag */, + const Vec128 v) { + const Full256 di32; + const Full256 du32; + const Full256 df32; + // Expand to u32 so we can shift. + const auto bits16 = PromoteTo(du32, Vec256{v.raw}); + const auto sign = ShiftRight<15>(bits16); + const auto biased_exp = ShiftRight<10>(bits16) & Set(du32, 0x1F); + const auto mantissa = bits16 & Set(du32, 0x3FF); + const auto subnormal = + BitCast(du32, ConvertTo(df32, BitCast(di32, mantissa)) * + Set(df32, 1.0f / 16384 / 1024)); + + const auto biased_exp32 = biased_exp + Set(du32, 127 - 15); + const auto mantissa32 = ShiftLeft<23 - 10>(mantissa); + const auto normal = ShiftLeft<23>(biased_exp32) | mantissa32; + const auto bits32 = IfThenElse(biased_exp == Zero(du32), subnormal, normal); + return BitCast(df32, ShiftLeft<31>(sign) | bits32); +} + +HWY_API Vec256 PromoteTo(Full256 df32, + const Vec128 v) { + const Rebind du16; + const RebindToSigned di32; + return BitCast(df32, ShiftLeft<16>(PromoteTo(di32, BitCast(du16, v)))); +} + +// ------------------------------ Demotions (full -> part w/ narrow lanes) + +HWY_API Vec128 DemoteTo(Full128 /* tag */, + const Vec256 v) { + return Vec128{wasm_u16x8_narrow_i32x4(v.raw, v.raw)}; +} + +HWY_API Vec128 DemoteTo(Full128 /* tag */, + const Vec256 v) { + return Vec128{wasm_i16x8_narrow_i32x4(v.raw, v.raw)}; +} + +HWY_API Vec128 DemoteTo(Full128 /* tag */, + const Vec256 v) { + const auto intermediate = wasm_i16x8_narrow_i32x4(v.raw, v.raw); + return Vec128{wasm_u8x16_narrow_i16x8(intermediate, intermediate)}; +} + +HWY_API Vec128 DemoteTo(Full128 /* tag */, + const Vec256 v) { + return Vec128{wasm_u8x16_narrow_i16x8(v.raw, v.raw)}; +} + +HWY_API Vec128 DemoteTo(Full128 /* tag */, + const Vec256 v) { + const auto intermediate = wasm_i16x8_narrow_i32x4(v.raw, v.raw); + return Vec128{wasm_i8x16_narrow_i16x8(intermediate, intermediate)}; +} + +HWY_API Vec128 DemoteTo(Full128 /* tag */, + const Vec256 v) { + return Vec128{wasm_i8x16_narrow_i16x8(v.raw, v.raw)}; +} + +HWY_API Vec128 DemoteTo(Full128 /* di */, + const Vec256 v) { + return Vec128{wasm_i32x4_trunc_sat_f64x2_zero(v.raw)}; +} + +HWY_API Vec128 DemoteTo(Full128 /* tag */, + const Vec256 v) { + const Full256 di; + const Full256 du; + const Full256 du16; + const auto bits32 = BitCast(du, v); + const auto sign = ShiftRight<31>(bits32); + const auto biased_exp32 = ShiftRight<23>(bits32) & Set(du, 0xFF); + const auto mantissa32 = bits32 & Set(du, 0x7FFFFF); + + const auto k15 = Set(di, 15); + const auto exp = Min(BitCast(di, biased_exp32) - Set(di, 127), k15); + const auto is_tiny = exp < Set(di, -24); + + const auto is_subnormal = exp < Set(di, -14); + const auto biased_exp16 = + BitCast(du, IfThenZeroElse(is_subnormal, exp + k15)); + const auto sub_exp = BitCast(du, Set(di, -14) - exp); // [1, 11) + const auto sub_m = (Set(du, 1) << (Set(du, 10) - sub_exp)) + + (mantissa32 >> (Set(du, 13) + sub_exp)); + const auto mantissa16 = IfThenElse(RebindMask(du, is_subnormal), sub_m, + ShiftRight<13>(mantissa32)); // <1024 + + 
const auto sign16 = ShiftLeft<15>(sign); + const auto normal16 = sign16 | ShiftLeft<10>(biased_exp16) | mantissa16; + const auto bits16 = IfThenZeroElse(is_tiny, BitCast(di, normal16)); + return Vec128{DemoteTo(du16, bits16).raw}; +} + +HWY_API Vec128 DemoteTo(Full128 dbf16, + const Vec256 v) { + const Rebind di32; + const Rebind du32; // for logical shift right + const Rebind du16; + const auto bits_in_32 = BitCast(di32, ShiftRight<16>(BitCast(du32, v))); + return BitCast(dbf16, DemoteTo(du16, bits_in_32)); +} + +HWY_API Vec128 ReorderDemote2To(Full128 dbf16, + Vec256 a, Vec256 b) { + const RebindToUnsigned du16; + const Repartition du32; + const Vec256 b_in_even = ShiftRight<16>(BitCast(du32, b)); + return BitCast(dbf16, OddEven(BitCast(du16, a), BitCast(du16, b_in_even))); +} + +HWY_API Vec512 ReorderDemote2To(Full512 /*d16*/, + Vec512 a, Vec512 b) { + return Vec512{wasm_i16x8_narrow_i32x4(a.raw, b.raw)}; +} + +// For already range-limited input [0, 255]. +HWY_API Vec256 U8FromU32(const Vec256 v) { + const auto intermediate = wasm_i16x8_narrow_i32x4(v.raw, v.raw); + return Vec256{wasm_u8x16_narrow_i16x8(intermediate, intermediate)}; +} + +// ------------------------------ Truncations + +HWY_API Vec256 TruncateTo(Simd /* tag */, + const Vec256 v) { + return Vec256{wasm_i8x16_shuffle(v.v0.raw, v.v1.raw, 0, 8, 16, 24, + 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, + 16, 24)}; +} + +HWY_API Vec256 TruncateTo(Simd /* tag */, + const Vec256 v) { + return Vec256{wasm_i8x16_shuffle(v.v0.raw, v.v1.raw, 0, 1, 8, 9, + 16, 17, 24, 25, 0, 1, 8, 9, 16, + 17, 24, 25)}; +} + +HWY_API Vec256 TruncateTo(Simd /* tag */, + const Vec256 v) { + return Vec256{wasm_i8x16_shuffle(v.v0.raw, v.v1.raw, 0, 1, 2, 3, + 8, 9, 10, 11, 16, 17, 18, 19, + 24, 25, 26, 27)}; +} + +HWY_API Vec256 TruncateTo(Simd /* tag */, + const Vec256 v) { + return Vec256{wasm_i8x16_shuffle(v.v0.raw, v.v1.raw, 0, 4, 8, 12, + 16, 20, 24, 28, 0, 4, 8, 12, 16, + 20, 24, 28)}; +} + +HWY_API Vec256 TruncateTo(Simd /* tag */, + const Vec256 v) { + return Vec256{wasm_i8x16_shuffle(v.v0.raw, v.v1.raw, 0, 1, 4, 5, + 8, 9, 12, 13, 16, 17, 20, 21, + 24, 25, 28, 29)}; +} + +HWY_API Vec256 TruncateTo(Simd /* tag */, + const Vec256 v) { + return Vec256{wasm_i8x16_shuffle(v.v0.raw, v.v1.raw, 0, 2, 4, 6, + 8, 10, 12, 14, 16, 18, 20, 22, + 24, 26, 28, 30)}; +} + +// ------------------------------ Convert i32 <=> f32 (Round) + +HWY_API Vec256 ConvertTo(Full256 /* tag */, + const Vec256 v) { + return Vec256{wasm_f32x4_convert_i32x4(v.raw)}; +} +HWY_API Vec256 ConvertTo(Full256 /* tag */, + const Vec256 v) { + return Vec256{wasm_f32x4_convert_u32x4(v.raw)}; +} +// Truncates (rounds toward zero). +HWY_API Vec256 ConvertTo(Full256 /* tag */, + const Vec256 v) { + return Vec256{wasm_i32x4_trunc_sat_f32x4(v.raw)}; +} + +HWY_API Vec256 NearestInt(const Vec256 v) { + return ConvertTo(Full256(), Round(v)); +} + +// ================================================== MISC + +// ------------------------------ LoadMaskBits (TestBit) + +namespace detail { + +template +HWY_INLINE Mask256 LoadMaskBits(Full256 d, uint64_t bits) { + const RebindToUnsigned du; + // Easier than Set(), which would require an >8-bit type, which would not + // compile for T=uint8_t, N=1. + const Vec256 vbits{wasm_i32x4_splat(static_cast(bits))}; + + // Replicate bytes 8x such that each byte contains the bit that governs it. 
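+  // (TableLookupBytes with kRep8 copies byte 0 of 'bits' into lanes 0..7 and
+  // byte 1 into lanes 8..15; TestBit with kBit then isolates each lane's
+  // governing bit.)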
+ alignas(32) constexpr uint8_t kRep8[16] = {0, 0, 0, 0, 0, 0, 0, 0, + 1, 1, 1, 1, 1, 1, 1, 1}; + const auto rep8 = TableLookupBytes(vbits, Load(du, kRep8)); + + alignas(32) constexpr uint8_t kBit[16] = {1, 2, 4, 8, 16, 32, 64, 128, + 1, 2, 4, 8, 16, 32, 64, 128}; + return RebindMask(d, TestBit(rep8, LoadDup128(du, kBit))); +} + +template +HWY_INLINE Mask256 LoadMaskBits(Full256 d, uint64_t bits) { + const RebindToUnsigned du; + alignas(32) constexpr uint16_t kBit[8] = {1, 2, 4, 8, 16, 32, 64, 128}; + return RebindMask(d, TestBit(Set(du, bits), Load(du, kBit))); +} + +template +HWY_INLINE Mask256 LoadMaskBits(Full256 d, uint64_t bits) { + const RebindToUnsigned du; + alignas(32) constexpr uint32_t kBit[8] = {1, 2, 4, 8}; + return RebindMask(d, TestBit(Set(du, bits), Load(du, kBit))); +} + +template +HWY_INLINE Mask256 LoadMaskBits(Full256 d, uint64_t bits) { + const RebindToUnsigned du; + alignas(32) constexpr uint64_t kBit[8] = {1, 2}; + return RebindMask(d, TestBit(Set(du, bits), Load(du, kBit))); +} + +} // namespace detail + +// `p` points to at least 8 readable bytes, not all of which need be valid. +template +HWY_API Mask256 LoadMaskBits(Full256 d, + const uint8_t* HWY_RESTRICT bits) { + uint64_t mask_bits = 0; + CopyBytes<(N + 7) / 8>(bits, &mask_bits); + return detail::LoadMaskBits(d, mask_bits); +} + +// ------------------------------ Mask + +namespace detail { + +// Full +template +HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/, + const Mask128 mask) { + alignas(32) uint64_t lanes[2]; + wasm_v128_store(lanes, mask.raw); + + constexpr uint64_t kMagic = 0x103070F1F3F80ULL; + const uint64_t lo = ((lanes[0] * kMagic) >> 56); + const uint64_t hi = ((lanes[1] * kMagic) >> 48) & 0xFF00; + return (hi + lo); +} + +template +HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<2> /*tag*/, + const Mask256 mask) { + // Remove useless lower half of each u16 while preserving the sign bit. + const __i16x8 zero = wasm_i16x8_splat(0); + const Mask256 mask8{wasm_i8x16_narrow_i16x8(mask.raw, zero)}; + return BitsFromMask(hwy::SizeTag<1>(), mask8); +} + +template +HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<4> /*tag*/, + const Mask256 mask) { + const __i32x4 mask_i = static_cast<__i32x4>(mask.raw); + const __i32x4 slice = wasm_i32x4_make(1, 2, 4, 8); + const __i32x4 sliced_mask = wasm_v128_and(mask_i, slice); + alignas(32) uint32_t lanes[4]; + wasm_v128_store(lanes, sliced_mask); + return lanes[0] | lanes[1] | lanes[2] | lanes[3]; +} + +// Returns 0xFF for bytes with index >= N, otherwise 0. +constexpr __i8x16 BytesAbove() { + return /**/ + (N == 0) ? wasm_i32x4_make(-1, -1, -1, -1) + : (N == 4) ? wasm_i32x4_make(0, -1, -1, -1) + : (N == 8) ? wasm_i32x4_make(0, 0, -1, -1) + : (N == 12) ? wasm_i32x4_make(0, 0, 0, -1) + : (N == 16) ? wasm_i32x4_make(0, 0, 0, 0) + : (N == 2) ? wasm_i16x8_make(0, -1, -1, -1, -1, -1, -1, -1) + : (N == 6) ? wasm_i16x8_make(0, 0, 0, -1, -1, -1, -1, -1) + : (N == 10) ? wasm_i16x8_make(0, 0, 0, 0, 0, -1, -1, -1) + : (N == 14) ? wasm_i16x8_make(0, 0, 0, 0, 0, 0, 0, -1) + : (N == 1) ? wasm_i8x16_make(0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1) + : (N == 3) ? wasm_i8x16_make(0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1) + : (N == 5) ? wasm_i8x16_make(0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1) + : (N == 7) ? wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, + -1, -1, -1) + : (N == 9) ? wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, + -1, -1, -1) + : (N == 11) + ? 
wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1) + : (N == 13) + ? wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1) + : wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1); +} + +template +HWY_INLINE uint64_t BitsFromMask(const Mask256 mask) { + return BitsFromMask(hwy::SizeTag(), mask); +} + +template +HWY_INLINE size_t CountTrue(hwy::SizeTag<1> tag, const Mask128 m) { + return PopCount(BitsFromMask(tag, m)); +} + +template +HWY_INLINE size_t CountTrue(hwy::SizeTag<2> tag, const Mask128 m) { + return PopCount(BitsFromMask(tag, m)); +} + +template +HWY_INLINE size_t CountTrue(hwy::SizeTag<4> /*tag*/, const Mask128 m) { + const __i32x4 var_shift = wasm_i32x4_make(1, 2, 4, 8); + const __i32x4 shifted_bits = wasm_v128_and(m.raw, var_shift); + alignas(32) uint64_t lanes[2]; + wasm_v128_store(lanes, shifted_bits); + return PopCount(lanes[0] | lanes[1]); +} + +} // namespace detail + +// `p` points to at least 8 writable bytes. +template +HWY_API size_t StoreMaskBits(const Full256 /* tag */, const Mask256 mask, + uint8_t* bits) { + const uint64_t mask_bits = detail::BitsFromMask(mask); + const size_t kNumBytes = (N + 7) / 8; + CopyBytes(&mask_bits, bits); + return kNumBytes; +} + +template +HWY_API size_t CountTrue(const Full256 /* tag */, const Mask128 m) { + return detail::CountTrue(hwy::SizeTag(), m); +} + +template +HWY_API bool AllFalse(const Full256 d, const Mask128 m) { +#if 0 + // Casting followed by wasm_i8x16_any_true results in wasm error: + // i32.eqz[0] expected type i32, found i8x16.popcnt of type s128 + const auto v8 = BitCast(Full256(), VecFromMask(d, m)); + return !wasm_i8x16_any_true(v8.raw); +#else + (void)d; + return (wasm_i64x2_extract_lane(m.raw, 0) | + wasm_i64x2_extract_lane(m.raw, 1)) == 0; +#endif +} + +// Full vector +namespace detail { +template +HWY_INLINE bool AllTrue(hwy::SizeTag<1> /*tag*/, const Mask128 m) { + return wasm_i8x16_all_true(m.raw); +} +template +HWY_INLINE bool AllTrue(hwy::SizeTag<2> /*tag*/, const Mask128 m) { + return wasm_i16x8_all_true(m.raw); +} +template +HWY_INLINE bool AllTrue(hwy::SizeTag<4> /*tag*/, const Mask128 m) { + return wasm_i32x4_all_true(m.raw); +} + +} // namespace detail + +template +HWY_API bool AllTrue(const Full256 /* tag */, const Mask128 m) { + return detail::AllTrue(hwy::SizeTag(), m); +} + +template +HWY_API size_t FindKnownFirstTrue(const Full256 /* tag */, + const Mask256 mask) { + const uint64_t bits = detail::BitsFromMask(mask); + return Num0BitsBelowLS1Bit_Nonzero64(bits); +} + +template +HWY_API intptr_t FindFirstTrue(const Full256 /* tag */, + const Mask256 mask) { + const uint64_t bits = detail::BitsFromMask(mask); + return bits ? Num0BitsBelowLS1Bit_Nonzero64(bits) : -1; +} + +// ------------------------------ Compress + +namespace detail { + +template +HWY_INLINE Vec256 Idx16x8FromBits(const uint64_t mask_bits) { + HWY_DASSERT(mask_bits < 256); + const Full256 d; + const Rebind d8; + const Full256 du; + + // We need byte indices for TableLookupBytes (one vector's worth for each of + // 256 combinations of 8 mask bits). Loading them directly requires 4 KiB. We + // can instead store lane indices and convert to byte indices (2*lane + 0..1), + // with the doubling baked into the table. Unpacking nibbles is likely more + // costly than the higher cache footprint from storing bytes. 
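+  // (Each 8-byte entry lists the already-doubled lane indices of the kept
+  // lanes in compressed order; e.g. mask_bits = 5, i.e. lanes 0 and 2, maps
+  // to {0, 4, 0, ...}. ZipLower below duplicates each index into a byte pair,
+  // and adding 0x0100 turns {2k, 2k} into the byte indices {2k, 2k+1}.)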
+ alignas(32) constexpr uint8_t table[256 * 8] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, + 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, + 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 2, 4, 0, 0, 0, 0, + 0, 0, 0, 2, 4, 0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0, 0, + 0, 6, 0, 0, 0, 0, 0, 0, 2, 6, 0, 0, 0, 0, 0, 0, 0, 2, + 6, 0, 0, 0, 0, 0, 4, 6, 0, 0, 0, 0, 0, 0, 0, 4, 6, 0, + 0, 0, 0, 0, 2, 4, 6, 0, 0, 0, 0, 0, 0, 2, 4, 6, 0, 0, + 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 8, 0, 0, 0, 0, 0, 0, + 2, 8, 0, 0, 0, 0, 0, 0, 0, 2, 8, 0, 0, 0, 0, 0, 4, 8, + 0, 0, 0, 0, 0, 0, 0, 4, 8, 0, 0, 0, 0, 0, 2, 4, 8, 0, + 0, 0, 0, 0, 0, 2, 4, 8, 0, 0, 0, 0, 6, 8, 0, 0, 0, 0, + 0, 0, 0, 6, 8, 0, 0, 0, 0, 0, 2, 6, 8, 0, 0, 0, 0, 0, + 0, 2, 6, 8, 0, 0, 0, 0, 4, 6, 8, 0, 0, 0, 0, 0, 0, 4, + 6, 8, 0, 0, 0, 0, 2, 4, 6, 8, 0, 0, 0, 0, 0, 2, 4, 6, + 8, 0, 0, 0, 10, 0, 0, 0, 0, 0, 0, 0, 0, 10, 0, 0, 0, 0, + 0, 0, 2, 10, 0, 0, 0, 0, 0, 0, 0, 2, 10, 0, 0, 0, 0, 0, + 4, 10, 0, 0, 0, 0, 0, 0, 0, 4, 10, 0, 0, 0, 0, 0, 2, 4, + 10, 0, 0, 0, 0, 0, 0, 2, 4, 10, 0, 0, 0, 0, 6, 10, 0, 0, + 0, 0, 0, 0, 0, 6, 10, 0, 0, 0, 0, 0, 2, 6, 10, 0, 0, 0, + 0, 0, 0, 2, 6, 10, 0, 0, 0, 0, 4, 6, 10, 0, 0, 0, 0, 0, + 0, 4, 6, 10, 0, 0, 0, 0, 2, 4, 6, 10, 0, 0, 0, 0, 0, 2, + 4, 6, 10, 0, 0, 0, 8, 10, 0, 0, 0, 0, 0, 0, 0, 8, 10, 0, + 0, 0, 0, 0, 2, 8, 10, 0, 0, 0, 0, 0, 0, 2, 8, 10, 0, 0, + 0, 0, 4, 8, 10, 0, 0, 0, 0, 0, 0, 4, 8, 10, 0, 0, 0, 0, + 2, 4, 8, 10, 0, 0, 0, 0, 0, 2, 4, 8, 10, 0, 0, 0, 6, 8, + 10, 0, 0, 0, 0, 0, 0, 6, 8, 10, 0, 0, 0, 0, 2, 6, 8, 10, + 0, 0, 0, 0, 0, 2, 6, 8, 10, 0, 0, 0, 4, 6, 8, 10, 0, 0, + 0, 0, 0, 4, 6, 8, 10, 0, 0, 0, 2, 4, 6, 8, 10, 0, 0, 0, + 0, 2, 4, 6, 8, 10, 0, 0, 12, 0, 0, 0, 0, 0, 0, 0, 0, 12, + 0, 0, 0, 0, 0, 0, 2, 12, 0, 0, 0, 0, 0, 0, 0, 2, 12, 0, + 0, 0, 0, 0, 4, 12, 0, 0, 0, 0, 0, 0, 0, 4, 12, 0, 0, 0, + 0, 0, 2, 4, 12, 0, 0, 0, 0, 0, 0, 2, 4, 12, 0, 0, 0, 0, + 6, 12, 0, 0, 0, 0, 0, 0, 0, 6, 12, 0, 0, 0, 0, 0, 2, 6, + 12, 0, 0, 0, 0, 0, 0, 2, 6, 12, 0, 0, 0, 0, 4, 6, 12, 0, + 0, 0, 0, 0, 0, 4, 6, 12, 0, 0, 0, 0, 2, 4, 6, 12, 0, 0, + 0, 0, 0, 2, 4, 6, 12, 0, 0, 0, 8, 12, 0, 0, 0, 0, 0, 0, + 0, 8, 12, 0, 0, 0, 0, 0, 2, 8, 12, 0, 0, 0, 0, 0, 0, 2, + 8, 12, 0, 0, 0, 0, 4, 8, 12, 0, 0, 0, 0, 0, 0, 4, 8, 12, + 0, 0, 0, 0, 2, 4, 8, 12, 0, 0, 0, 0, 0, 2, 4, 8, 12, 0, + 0, 0, 6, 8, 12, 0, 0, 0, 0, 0, 0, 6, 8, 12, 0, 0, 0, 0, + 2, 6, 8, 12, 0, 0, 0, 0, 0, 2, 6, 8, 12, 0, 0, 0, 4, 6, + 8, 12, 0, 0, 0, 0, 0, 4, 6, 8, 12, 0, 0, 0, 2, 4, 6, 8, + 12, 0, 0, 0, 0, 2, 4, 6, 8, 12, 0, 0, 10, 12, 0, 0, 0, 0, + 0, 0, 0, 10, 12, 0, 0, 0, 0, 0, 2, 10, 12, 0, 0, 0, 0, 0, + 0, 2, 10, 12, 0, 0, 0, 0, 4, 10, 12, 0, 0, 0, 0, 0, 0, 4, + 10, 12, 0, 0, 0, 0, 2, 4, 10, 12, 0, 0, 0, 0, 0, 2, 4, 10, + 12, 0, 0, 0, 6, 10, 12, 0, 0, 0, 0, 0, 0, 6, 10, 12, 0, 0, + 0, 0, 2, 6, 10, 12, 0, 0, 0, 0, 0, 2, 6, 10, 12, 0, 0, 0, + 4, 6, 10, 12, 0, 0, 0, 0, 0, 4, 6, 10, 12, 0, 0, 0, 2, 4, + 6, 10, 12, 0, 0, 0, 0, 2, 4, 6, 10, 12, 0, 0, 8, 10, 12, 0, + 0, 0, 0, 0, 0, 8, 10, 12, 0, 0, 0, 0, 2, 8, 10, 12, 0, 0, + 0, 0, 0, 2, 8, 10, 12, 0, 0, 0, 4, 8, 10, 12, 0, 0, 0, 0, + 0, 4, 8, 10, 12, 0, 0, 0, 2, 4, 8, 10, 12, 0, 0, 0, 0, 2, + 4, 8, 10, 12, 0, 0, 6, 8, 10, 12, 0, 0, 0, 0, 0, 6, 8, 10, + 12, 0, 0, 0, 2, 6, 8, 10, 12, 0, 0, 0, 0, 2, 6, 8, 10, 12, + 0, 0, 4, 6, 8, 10, 12, 0, 0, 0, 0, 4, 6, 8, 10, 12, 0, 0, + 2, 4, 6, 8, 10, 12, 0, 0, 0, 2, 4, 6, 8, 10, 12, 0, 14, 0, + 0, 0, 0, 0, 0, 0, 0, 14, 0, 0, 0, 0, 0, 0, 2, 14, 0, 0, + 0, 0, 0, 0, 0, 2, 14, 0, 0, 0, 0, 0, 4, 14, 0, 0, 0, 0, + 0, 0, 0, 4, 14, 0, 0, 0, 0, 0, 2, 4, 14, 0, 0, 0, 0, 0, + 0, 2, 
4, 14, 0, 0, 0, 0, 6, 14, 0, 0, 0, 0, 0, 0, 0, 6, + 14, 0, 0, 0, 0, 0, 2, 6, 14, 0, 0, 0, 0, 0, 0, 2, 6, 14, + 0, 0, 0, 0, 4, 6, 14, 0, 0, 0, 0, 0, 0, 4, 6, 14, 0, 0, + 0, 0, 2, 4, 6, 14, 0, 0, 0, 0, 0, 2, 4, 6, 14, 0, 0, 0, + 8, 14, 0, 0, 0, 0, 0, 0, 0, 8, 14, 0, 0, 0, 0, 0, 2, 8, + 14, 0, 0, 0, 0, 0, 0, 2, 8, 14, 0, 0, 0, 0, 4, 8, 14, 0, + 0, 0, 0, 0, 0, 4, 8, 14, 0, 0, 0, 0, 2, 4, 8, 14, 0, 0, + 0, 0, 0, 2, 4, 8, 14, 0, 0, 0, 6, 8, 14, 0, 0, 0, 0, 0, + 0, 6, 8, 14, 0, 0, 0, 0, 2, 6, 8, 14, 0, 0, 0, 0, 0, 2, + 6, 8, 14, 0, 0, 0, 4, 6, 8, 14, 0, 0, 0, 0, 0, 4, 6, 8, + 14, 0, 0, 0, 2, 4, 6, 8, 14, 0, 0, 0, 0, 2, 4, 6, 8, 14, + 0, 0, 10, 14, 0, 0, 0, 0, 0, 0, 0, 10, 14, 0, 0, 0, 0, 0, + 2, 10, 14, 0, 0, 0, 0, 0, 0, 2, 10, 14, 0, 0, 0, 0, 4, 10, + 14, 0, 0, 0, 0, 0, 0, 4, 10, 14, 0, 0, 0, 0, 2, 4, 10, 14, + 0, 0, 0, 0, 0, 2, 4, 10, 14, 0, 0, 0, 6, 10, 14, 0, 0, 0, + 0, 0, 0, 6, 10, 14, 0, 0, 0, 0, 2, 6, 10, 14, 0, 0, 0, 0, + 0, 2, 6, 10, 14, 0, 0, 0, 4, 6, 10, 14, 0, 0, 0, 0, 0, 4, + 6, 10, 14, 0, 0, 0, 2, 4, 6, 10, 14, 0, 0, 0, 0, 2, 4, 6, + 10, 14, 0, 0, 8, 10, 14, 0, 0, 0, 0, 0, 0, 8, 10, 14, 0, 0, + 0, 0, 2, 8, 10, 14, 0, 0, 0, 0, 0, 2, 8, 10, 14, 0, 0, 0, + 4, 8, 10, 14, 0, 0, 0, 0, 0, 4, 8, 10, 14, 0, 0, 0, 2, 4, + 8, 10, 14, 0, 0, 0, 0, 2, 4, 8, 10, 14, 0, 0, 6, 8, 10, 14, + 0, 0, 0, 0, 0, 6, 8, 10, 14, 0, 0, 0, 2, 6, 8, 10, 14, 0, + 0, 0, 0, 2, 6, 8, 10, 14, 0, 0, 4, 6, 8, 10, 14, 0, 0, 0, + 0, 4, 6, 8, 10, 14, 0, 0, 2, 4, 6, 8, 10, 14, 0, 0, 0, 2, + 4, 6, 8, 10, 14, 0, 12, 14, 0, 0, 0, 0, 0, 0, 0, 12, 14, 0, + 0, 0, 0, 0, 2, 12, 14, 0, 0, 0, 0, 0, 0, 2, 12, 14, 0, 0, + 0, 0, 4, 12, 14, 0, 0, 0, 0, 0, 0, 4, 12, 14, 0, 0, 0, 0, + 2, 4, 12, 14, 0, 0, 0, 0, 0, 2, 4, 12, 14, 0, 0, 0, 6, 12, + 14, 0, 0, 0, 0, 0, 0, 6, 12, 14, 0, 0, 0, 0, 2, 6, 12, 14, + 0, 0, 0, 0, 0, 2, 6, 12, 14, 0, 0, 0, 4, 6, 12, 14, 0, 0, + 0, 0, 0, 4, 6, 12, 14, 0, 0, 0, 2, 4, 6, 12, 14, 0, 0, 0, + 0, 2, 4, 6, 12, 14, 0, 0, 8, 12, 14, 0, 0, 0, 0, 0, 0, 8, + 12, 14, 0, 0, 0, 0, 2, 8, 12, 14, 0, 0, 0, 0, 0, 2, 8, 12, + 14, 0, 0, 0, 4, 8, 12, 14, 0, 0, 0, 0, 0, 4, 8, 12, 14, 0, + 0, 0, 2, 4, 8, 12, 14, 0, 0, 0, 0, 2, 4, 8, 12, 14, 0, 0, + 6, 8, 12, 14, 0, 0, 0, 0, 0, 6, 8, 12, 14, 0, 0, 0, 2, 6, + 8, 12, 14, 0, 0, 0, 0, 2, 6, 8, 12, 14, 0, 0, 4, 6, 8, 12, + 14, 0, 0, 0, 0, 4, 6, 8, 12, 14, 0, 0, 2, 4, 6, 8, 12, 14, + 0, 0, 0, 2, 4, 6, 8, 12, 14, 0, 10, 12, 14, 0, 0, 0, 0, 0, + 0, 10, 12, 14, 0, 0, 0, 0, 2, 10, 12, 14, 0, 0, 0, 0, 0, 2, + 10, 12, 14, 0, 0, 0, 4, 10, 12, 14, 0, 0, 0, 0, 0, 4, 10, 12, + 14, 0, 0, 0, 2, 4, 10, 12, 14, 0, 0, 0, 0, 2, 4, 10, 12, 14, + 0, 0, 6, 10, 12, 14, 0, 0, 0, 0, 0, 6, 10, 12, 14, 0, 0, 0, + 2, 6, 10, 12, 14, 0, 0, 0, 0, 2, 6, 10, 12, 14, 0, 0, 4, 6, + 10, 12, 14, 0, 0, 0, 0, 4, 6, 10, 12, 14, 0, 0, 2, 4, 6, 10, + 12, 14, 0, 0, 0, 2, 4, 6, 10, 12, 14, 0, 8, 10, 12, 14, 0, 0, + 0, 0, 0, 8, 10, 12, 14, 0, 0, 0, 2, 8, 10, 12, 14, 0, 0, 0, + 0, 2, 8, 10, 12, 14, 0, 0, 4, 8, 10, 12, 14, 0, 0, 0, 0, 4, + 8, 10, 12, 14, 0, 0, 2, 4, 8, 10, 12, 14, 0, 0, 0, 2, 4, 8, + 10, 12, 14, 0, 6, 8, 10, 12, 14, 0, 0, 0, 0, 6, 8, 10, 12, 14, + 0, 0, 2, 6, 8, 10, 12, 14, 0, 0, 0, 2, 6, 8, 10, 12, 14, 0, + 4, 6, 8, 10, 12, 14, 0, 0, 0, 4, 6, 8, 10, 12, 14, 0, 2, 4, + 6, 8, 10, 12, 14, 0, 0, 2, 4, 6, 8, 10, 12, 14}; + + const Vec256 byte_idx{Load(d8, table + mask_bits * 8).raw}; + const Vec256 pairs = ZipLower(byte_idx, byte_idx); + return BitCast(d, pairs + Set(du, 0x0100)); +} + +template +HWY_INLINE Vec256 Idx32x4FromBits(const uint64_t mask_bits) { + HWY_DASSERT(mask_bits < 16); + + // There are only 4 
lanes, so we can afford to load the index vector directly. + alignas(32) constexpr uint8_t packed_array[16 * 16] = { + 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, // + 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, // + 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, // + 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, // + 8, 9, 10, 11, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, // + 0, 1, 2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 0, 1, 2, 3, // + 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 0, 1, 2, 3, // + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, // + 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, // + 0, 1, 2, 3, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3, // + 4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3, // + 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, // + 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3, // + 0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, // + 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, // + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; + + const Full256 d; + const Repartition d8; + return BitCast(d, Load(d8, packed_array + 16 * mask_bits)); +} + +#if HWY_HAVE_INTEGER64 || HWY_HAVE_FLOAT64 + +template +HWY_INLINE Vec256 Idx64x2FromBits(const uint64_t mask_bits) { + HWY_DASSERT(mask_bits < 4); + + // There are only 2 lanes, so we can afford to load the index vector directly. + alignas(32) constexpr uint8_t packed_array[4 * 16] = { + 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, // + 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, // + 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, // + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; + + const Full256 d; + const Repartition d8; + return BitCast(d, Load(d8, packed_array + 16 * mask_bits)); +} + +#endif + +// Helper functions called by both Compress and CompressStore - avoids a +// redundant BitsFromMask in the latter. 
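+// For example, mask_bits = 0b0101 compresses lanes 0 and 2 to the front;
+// CompressStore computes mask_bits once and reuses it both for the index
+// lookup and for the PopCount that reports how many lanes were written.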
+ +template +HWY_INLINE Vec256 Compress(hwy::SizeTag<2> /*tag*/, Vec256 v, + const uint64_t mask_bits) { + const auto idx = detail::Idx16x8FromBits(mask_bits); + using D = Full256; + const RebindToSigned di; + return BitCast(D(), TableLookupBytes(BitCast(di, v), BitCast(di, idx))); +} + +template +HWY_INLINE Vec256 Compress(hwy::SizeTag<4> /*tag*/, Vec256 v, + const uint64_t mask_bits) { + const auto idx = detail::Idx32x4FromBits(mask_bits); + using D = Full256; + const RebindToSigned di; + return BitCast(D(), TableLookupBytes(BitCast(di, v), BitCast(di, idx))); +} + +#if HWY_HAVE_INTEGER64 || HWY_HAVE_FLOAT64 + +template +HWY_INLINE Vec256 Compress(hwy::SizeTag<8> /*tag*/, + Vec256 v, + const uint64_t mask_bits) { + const auto idx = detail::Idx64x2FromBits(mask_bits); + using D = Full256; + const RebindToSigned di; + return BitCast(D(), TableLookupBytes(BitCast(di, v), BitCast(di, idx))); +} + +#endif + +} // namespace detail + +template +struct CompressIsPartition { + enum { value = 1 }; +}; + +template +HWY_API Vec256 Compress(Vec256 v, const Mask256 mask) { + const uint64_t mask_bits = detail::BitsFromMask(mask); + return detail::Compress(hwy::SizeTag(), v, mask_bits); +} + +// ------------------------------ CompressNot +template +HWY_API Vec256 Compress(Vec256 v, const Mask256 mask) { + return Compress(v, Not(mask)); +} + +// ------------------------------ CompressBlocksNot +HWY_API Vec256 CompressBlocksNot(Vec256 v, + Mask256 mask) { + HWY_ASSERT(0); // Not implemented +} + +// ------------------------------ CompressBits + +template +HWY_API Vec256 CompressBits(Vec256 v, const uint8_t* HWY_RESTRICT bits) { + uint64_t mask_bits = 0; + constexpr size_t kNumBytes = (N + 7) / 8; + CopyBytes(bits, &mask_bits); + if (N < 8) { + mask_bits &= (1ull << N) - 1; + } + + return detail::Compress(hwy::SizeTag(), v, mask_bits); +} + +// ------------------------------ CompressStore +template +HWY_API size_t CompressStore(Vec256 v, const Mask256 mask, Full256 d, + T* HWY_RESTRICT unaligned) { + const uint64_t mask_bits = detail::BitsFromMask(mask); + const auto c = detail::Compress(hwy::SizeTag(), v, mask_bits); + StoreU(c, d, unaligned); + return PopCount(mask_bits); +} + +// ------------------------------ CompressBlendedStore +template +HWY_API size_t CompressBlendedStore(Vec256 v, Mask256 m, Full256 d, + T* HWY_RESTRICT unaligned) { + const RebindToUnsigned du; // so we can support fp16/bf16 + using TU = TFromD; + const uint64_t mask_bits = detail::BitsFromMask(m); + const size_t count = PopCount(mask_bits); + const Mask256 store_mask = FirstN(du, count); + const Vec256 compressed = + detail::Compress(hwy::SizeTag(), BitCast(du, v), mask_bits); + const Vec256 prev = BitCast(du, LoadU(d, unaligned)); + StoreU(BitCast(d, IfThenElse(store_mask, compressed, prev)), d, unaligned); + return count; +} + +// ------------------------------ CompressBitsStore + +template +HWY_API size_t CompressBitsStore(Vec256 v, const uint8_t* HWY_RESTRICT bits, + Full256 d, T* HWY_RESTRICT unaligned) { + uint64_t mask_bits = 0; + constexpr size_t kNumBytes = (N + 7) / 8; + CopyBytes(bits, &mask_bits); + if (N < 8) { + mask_bits &= (1ull << N) - 1; + } + + const auto c = detail::Compress(hwy::SizeTag(), v, mask_bits); + StoreU(c, d, unaligned); + return PopCount(mask_bits); +} + +// ------------------------------ StoreInterleaved2/3/4 + +// HWY_NATIVE_LOAD_STORE_INTERLEAVED not set, hence defined in +// generic_ops-inl.h. 
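+// A minimal usage sketch of CompressStore (illustrative only, not part of
+// this header; KeepBelowPivot and the values are hypothetical). It writes the
+// lanes below a pivot to the front of `out` and returns how many were kept:
+#if 0
+size_t KeepBelowPivot(const int32_t* HWY_RESTRICT in,
+                      int32_t* HWY_RESTRICT out) {
+  const Full256<int32_t> d;
+  const Vec256<int32_t> v = LoadU(d, in);     // e.g. {10, 20, 30, 40, ...}
+  const Mask256<int32_t> m = v < Set(d, 25);  // true for 10 and 20
+  return CompressStore(v, m, d, out);         // out begins {10, 20}
+}
+#endif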
+ +// ------------------------------ MulEven/Odd (Load) + +HWY_INLINE Vec256 MulEven(const Vec256 a, + const Vec256 b) { + alignas(32) uint64_t mul[2]; + mul[0] = + Mul128(static_cast(wasm_i64x2_extract_lane(a.raw, 0)), + static_cast(wasm_i64x2_extract_lane(b.raw, 0)), &mul[1]); + return Load(Full256(), mul); +} + +HWY_INLINE Vec256 MulOdd(const Vec256 a, + const Vec256 b) { + alignas(32) uint64_t mul[2]; + mul[0] = + Mul128(static_cast(wasm_i64x2_extract_lane(a.raw, 1)), + static_cast(wasm_i64x2_extract_lane(b.raw, 1)), &mul[1]); + return Load(Full256(), mul); +} + +// ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower) + +HWY_API Vec256 ReorderWidenMulAccumulate(Full256 df32, + Vec256 a, + Vec256 b, + const Vec256 sum0, + Vec256& sum1) { + const Repartition du16; + const RebindToUnsigned du32; + const Vec256 zero = Zero(du16); + const Vec256 a0 = ZipLower(du32, zero, BitCast(du16, a)); + const Vec256 a1 = ZipUpper(du32, zero, BitCast(du16, a)); + const Vec256 b0 = ZipLower(du32, zero, BitCast(du16, b)); + const Vec256 b1 = ZipUpper(du32, zero, BitCast(du16, b)); + sum1 = MulAdd(BitCast(df32, a1), BitCast(df32, b1), sum1); + return MulAdd(BitCast(df32, a0), BitCast(df32, b0), sum0); +} + +HWY_API Vec256 ReorderWidenMulAccumulate(Full256 /*d32*/, + Vec256 a, + Vec256 b, + const Vec256 sum0, + Vec256& /*sum1*/) { + return sum0 + Vec256{wasm_i32x4_dot_i16x8(a.raw, b.raw)}; +} + +// ------------------------------ Reductions + +namespace detail { + +// u32/i32/f32: + +template +HWY_INLINE Vec256 SumOfLanes(hwy::SizeTag<4> /* tag */, + const Vec256 v3210) { + const Vec256 v1032 = Shuffle1032(v3210); + const Vec256 v31_20_31_20 = v3210 + v1032; + const Vec256 v20_31_20_31 = Shuffle0321(v31_20_31_20); + return v20_31_20_31 + v31_20_31_20; +} +template +HWY_INLINE Vec256 MinOfLanes(hwy::SizeTag<4> /* tag */, + const Vec256 v3210) { + const Vec256 v1032 = Shuffle1032(v3210); + const Vec256 v31_20_31_20 = Min(v3210, v1032); + const Vec256 v20_31_20_31 = Shuffle0321(v31_20_31_20); + return Min(v20_31_20_31, v31_20_31_20); +} +template +HWY_INLINE Vec256 MaxOfLanes(hwy::SizeTag<4> /* tag */, + const Vec256 v3210) { + const Vec256 v1032 = Shuffle1032(v3210); + const Vec256 v31_20_31_20 = Max(v3210, v1032); + const Vec256 v20_31_20_31 = Shuffle0321(v31_20_31_20); + return Max(v20_31_20_31, v31_20_31_20); +} + +// u64/i64/f64: + +template +HWY_INLINE Vec256 SumOfLanes(hwy::SizeTag<8> /* tag */, + const Vec256 v10) { + const Vec256 v01 = Shuffle01(v10); + return v10 + v01; +} +template +HWY_INLINE Vec256 MinOfLanes(hwy::SizeTag<8> /* tag */, + const Vec256 v10) { + const Vec256 v01 = Shuffle01(v10); + return Min(v10, v01); +} +template +HWY_INLINE Vec256 MaxOfLanes(hwy::SizeTag<8> /* tag */, + const Vec256 v10) { + const Vec256 v01 = Shuffle01(v10); + return Max(v10, v01); +} + +// u16/i16 +template +HWY_API Vec256 MinOfLanes(hwy::SizeTag<2> /* tag */, Vec256 /*v*/) { + HWY_ASSERT(0); // Not implemented +} +template +HWY_API Vec256 MaxOfLanes(hwy::SizeTag<2> /* tag */, Vec256 /*v*/) { + HWY_ASSERT(0); // Not implemented +} + +} // namespace detail + +// Supported for u/i/f 32/64. Returns the same value in each lane. 
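+// Usage note: a scalar total is GetLane(SumOfLanes(d, v)). The u16/i16
+// Min/MaxOfLanes above are not implemented for this target and assert if
+// reached.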
+template +HWY_API Vec256 SumOfLanes(Full256 /* tag */, const Vec256 v) { + return detail::SumOfLanes(hwy::SizeTag(), v); +} +template +HWY_API Vec256 MinOfLanes(Full256 /* tag */, const Vec256 v) { + return detail::MinOfLanes(hwy::SizeTag(), v); +} +template +HWY_API Vec256 MaxOfLanes(Full256 /* tag */, const Vec256 v) { + return detail::MaxOfLanes(hwy::SizeTag(), v); +} + +// ------------------------------ Lt128 + +template +HWY_INLINE Mask256 Lt128(Full256 d, Vec256 a, Vec256 b) {} + +template +HWY_INLINE Mask256 Lt128Upper(Full256 d, Vec256 a, Vec256 b) {} + +template +HWY_INLINE Mask256 Eq128(Full256 d, Vec256 a, Vec256 b) {} + +template +HWY_INLINE Mask256 Eq128Upper(Full256 d, Vec256 a, Vec256 b) {} + +template +HWY_INLINE Mask256 Ne128(Full256 d, Vec256 a, Vec256 b) {} + +template +HWY_INLINE Mask256 Ne128Upper(Full256 d, Vec256 a, Vec256 b) {} + +template +HWY_INLINE Vec256 Min128(Full256 d, Vec256 a, Vec256 b) {} + +template +HWY_INLINE Vec256 Max128(Full256 d, Vec256 a, Vec256 b) {} + +template +HWY_INLINE Vec256 Min128Upper(Full256 d, Vec256 a, Vec256 b) {} + +template +HWY_INLINE Vec256 Max128Upper(Full256 d, Vec256 a, Vec256 b) {} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); diff --git a/hwy/ops/x86_128-inl.h b/hwy/ops/x86_128-inl.h new file mode 100644 index 0000000..68b156e --- /dev/null +++ b/hwy/ops/x86_128-inl.h @@ -0,0 +1,7485 @@ +// Copyright 2019 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// 128-bit vectors and SSE4 instructions, plus some AVX2 and AVX512-VL +// operations when compiling for those targets. +// External include guard in highway.h - see comment there. + +// Must come before HWY_DIAGNOSTICS and HWY_COMPILER_GCC_ACTUAL +#include "hwy/base.h" + +// Avoid uninitialized warnings in GCC's emmintrin.h - see +// https://github.com/google/highway/issues/710 and pull/902) +HWY_DIAGNOSTICS(push) +#if HWY_COMPILER_GCC_ACTUAL +HWY_DIAGNOSTICS_OFF(disable : 4701, ignored "-Wuninitialized") +HWY_DIAGNOSTICS_OFF(disable : 4703 6001 26494, ignored "-Wmaybe-uninitialized") +#endif + +#include +#include +#if HWY_TARGET == HWY_SSSE3 +#include // SSSE3 +#else +#include // SSE4 +#include // CLMUL +#endif +#include +#include +#include // memcpy + +#include "hwy/ops/shared-inl.h" + +#if HWY_IS_MSAN +#include +#endif + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { + +#if HWY_TARGET <= HWY_AVX2 +template +using Full256 = Simd; +#endif + +#if HWY_TARGET <= HWY_AVX3 +template +using Full512 = Simd; +#endif + +namespace detail { + +template +struct Raw128 { + using type = __m128i; +}; +template <> +struct Raw128 { + using type = __m128; +}; +template <> +struct Raw128 { + using type = __m128d; +}; + +} // namespace detail + +template +class Vec128 { + using Raw = typename detail::Raw128::type; + + public: + // Compound assignment. 
Only usable if there is a corresponding non-member + // binary operator overload. For example, only f32 and f64 support division. + HWY_INLINE Vec128& operator*=(const Vec128 other) { + return *this = (*this * other); + } + HWY_INLINE Vec128& operator/=(const Vec128 other) { + return *this = (*this / other); + } + HWY_INLINE Vec128& operator+=(const Vec128 other) { + return *this = (*this + other); + } + HWY_INLINE Vec128& operator-=(const Vec128 other) { + return *this = (*this - other); + } + HWY_INLINE Vec128& operator&=(const Vec128 other) { + return *this = (*this & other); + } + HWY_INLINE Vec128& operator|=(const Vec128 other) { + return *this = (*this | other); + } + HWY_INLINE Vec128& operator^=(const Vec128 other) { + return *this = (*this ^ other); + } + + Raw raw; +}; + +template +using Vec64 = Vec128; + +template +using Vec32 = Vec128; + +#if HWY_TARGET <= HWY_AVX3 + +// Forward-declare for use by DeduceD, see below. +template +class Vec512; + +namespace detail { + +// Template arg: sizeof(lane type) +template +struct RawMask128 {}; +template <> +struct RawMask128<1> { + using type = __mmask16; +}; +template <> +struct RawMask128<2> { + using type = __mmask8; +}; +template <> +struct RawMask128<4> { + using type = __mmask8; +}; +template <> +struct RawMask128<8> { + using type = __mmask8; +}; + +} // namespace detail + +template +struct Mask128 { + using Raw = typename detail::RawMask128::type; + + static Mask128 FromBits(uint64_t mask_bits) { + return Mask128{static_cast(mask_bits)}; + } + + Raw raw; +}; + +#else // AVX2 or below + +// FF..FF or 0. +template +struct Mask128 { + typename detail::Raw128::type raw; +}; + +#endif // HWY_TARGET <= HWY_AVX3 + +#if HWY_TARGET <= HWY_AVX2 +// Forward-declare for use by DeduceD, see below. +template +class Vec256; +#endif + +namespace detail { + +// Deduce Simd from Vec* (pointers because Vec256/512 may be +// incomplete types at this point; this is simpler than avoiding multiple +// definitions of DFromV via #if) +struct DeduceD { + template + Simd operator()(const Vec128*) const { + return Simd(); + } +#if HWY_TARGET <= HWY_AVX2 + template + Full256 operator()(const hwy::HWY_NAMESPACE::Vec256*) const { + return Full256(); + } +#endif +#if HWY_TARGET <= HWY_AVX3 + template + Full512 operator()(const hwy::HWY_NAMESPACE::Vec512*) const { + return Full512(); + } +#endif +}; + +// Workaround for MSVC v19.14: alias with a dependent type fails to specialize. +template +struct ExpandDFromV { + using type = decltype(DeduceD()(static_cast(nullptr))); +}; + +} // namespace detail + +template +using DFromV = typename detail::ExpandDFromV::type; + +template +using TFromV = TFromD>; + +// ------------------------------ BitCast + +namespace detail { + +HWY_INLINE __m128i BitCastToInteger(__m128i v) { return v; } +HWY_INLINE __m128i BitCastToInteger(__m128 v) { return _mm_castps_si128(v); } +HWY_INLINE __m128i BitCastToInteger(__m128d v) { return _mm_castpd_si128(v); } + +template +HWY_INLINE Vec128 BitCastToByte(Vec128 v) { + return Vec128{BitCastToInteger(v.raw)}; +} + +// Cannot rely on function overloading because return types differ. 
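+// (Overloads returning __m128 vs. __m128d from the same __m128i argument
+// would differ only in return type, which C++ overload resolution does not
+// allow; hence one class template, specialized per destination type and
+// invoked via operator().)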
+template +struct BitCastFromInteger128 { + HWY_INLINE __m128i operator()(__m128i v) { return v; } +}; +template <> +struct BitCastFromInteger128 { + HWY_INLINE __m128 operator()(__m128i v) { return _mm_castsi128_ps(v); } +}; +template <> +struct BitCastFromInteger128 { + HWY_INLINE __m128d operator()(__m128i v) { return _mm_castsi128_pd(v); } +}; + +template +HWY_INLINE Vec128 BitCastFromByte(Simd /* tag */, + Vec128 v) { + return Vec128{BitCastFromInteger128()(v.raw)}; +} + +} // namespace detail + +template +HWY_API Vec128 BitCast(Simd d, + Vec128 v) { + return detail::BitCastFromByte(d, detail::BitCastToByte(v)); +} + +// ------------------------------ Zero + +// Returns an all-zero vector/part. +template +HWY_API Vec128 Zero(Simd /* tag */) { + return Vec128{_mm_setzero_si128()}; +} +template +HWY_API Vec128 Zero(Simd /* tag */) { + return Vec128{_mm_setzero_ps()}; +} +template +HWY_API Vec128 Zero(Simd /* tag */) { + return Vec128{_mm_setzero_pd()}; +} + +template +using VFromD = decltype(Zero(D())); + +// ------------------------------ Set + +// Returns a vector/part with all lanes set to "t". +template +HWY_API Vec128 Set(Simd /* tag */, const uint8_t t) { + return Vec128{_mm_set1_epi8(static_cast(t))}; // NOLINT +} +template +HWY_API Vec128 Set(Simd /* tag */, + const uint16_t t) { + return Vec128{_mm_set1_epi16(static_cast(t))}; // NOLINT +} +template +HWY_API Vec128 Set(Simd /* tag */, + const uint32_t t) { + return Vec128{_mm_set1_epi32(static_cast(t))}; +} +template +HWY_API Vec128 Set(Simd /* tag */, + const uint64_t t) { + return Vec128{ + _mm_set1_epi64x(static_cast(t))}; // NOLINT +} +template +HWY_API Vec128 Set(Simd /* tag */, const int8_t t) { + return Vec128{_mm_set1_epi8(static_cast(t))}; // NOLINT +} +template +HWY_API Vec128 Set(Simd /* tag */, const int16_t t) { + return Vec128{_mm_set1_epi16(static_cast(t))}; // NOLINT +} +template +HWY_API Vec128 Set(Simd /* tag */, const int32_t t) { + return Vec128{_mm_set1_epi32(t)}; +} +template +HWY_API Vec128 Set(Simd /* tag */, const int64_t t) { + return Vec128{ + _mm_set1_epi64x(static_cast(t))}; // NOLINT +} +template +HWY_API Vec128 Set(Simd /* tag */, const float t) { + return Vec128{_mm_set1_ps(t)}; +} +template +HWY_API Vec128 Set(Simd /* tag */, const double t) { + return Vec128{_mm_set1_pd(t)}; +} + +HWY_DIAGNOSTICS(push) +HWY_DIAGNOSTICS_OFF(disable : 4700, ignored "-Wuninitialized") + +// Returns a vector with uninitialized elements. +template +HWY_API Vec128 Undefined(Simd /* tag */) { + // Available on Clang 6.0, GCC 6.2, ICC 16.03, MSVC 19.14. All but ICC + // generate an XOR instruction. + return Vec128{_mm_undefined_si128()}; +} +template +HWY_API Vec128 Undefined(Simd /* tag */) { + return Vec128{_mm_undefined_ps()}; +} +template +HWY_API Vec128 Undefined(Simd /* tag */) { + return Vec128{_mm_undefined_pd()}; +} + +HWY_DIAGNOSTICS(pop) + +// ------------------------------ GetLane + +// Gets the single value stored in a vector/part. 
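+// (That is, lane 0. The u64/i64 variants below go through memory on 32-bit
+// x86, where _mm_cvtsi128_si64 is unavailable.)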
+template +HWY_API T GetLane(const Vec128 v) { + return static_cast(_mm_cvtsi128_si32(v.raw) & 0xFF); +} +template +HWY_API T GetLane(const Vec128 v) { + return static_cast(_mm_cvtsi128_si32(v.raw) & 0xFFFF); +} +template +HWY_API T GetLane(const Vec128 v) { + return static_cast(_mm_cvtsi128_si32(v.raw)); +} +template +HWY_API float GetLane(const Vec128 v) { + return _mm_cvtss_f32(v.raw); +} +template +HWY_API uint64_t GetLane(const Vec128 v) { +#if HWY_ARCH_X86_32 + alignas(16) uint64_t lanes[2]; + Store(v, Simd(), lanes); + return lanes[0]; +#else + return static_cast(_mm_cvtsi128_si64(v.raw)); +#endif +} +template +HWY_API int64_t GetLane(const Vec128 v) { +#if HWY_ARCH_X86_32 + alignas(16) int64_t lanes[2]; + Store(v, Simd(), lanes); + return lanes[0]; +#else + return _mm_cvtsi128_si64(v.raw); +#endif +} +template +HWY_API double GetLane(const Vec128 v) { + return _mm_cvtsd_f64(v.raw); +} + +// ================================================== LOGICAL + +// ------------------------------ And + +template +HWY_API Vec128 And(Vec128 a, Vec128 b) { + return Vec128{_mm_and_si128(a.raw, b.raw)}; +} +template +HWY_API Vec128 And(const Vec128 a, + const Vec128 b) { + return Vec128{_mm_and_ps(a.raw, b.raw)}; +} +template +HWY_API Vec128 And(const Vec128 a, + const Vec128 b) { + return Vec128{_mm_and_pd(a.raw, b.raw)}; +} + +// ------------------------------ AndNot + +// Returns ~not_mask & mask. +template +HWY_API Vec128 AndNot(Vec128 not_mask, Vec128 mask) { + return Vec128{_mm_andnot_si128(not_mask.raw, mask.raw)}; +} +template +HWY_API Vec128 AndNot(const Vec128 not_mask, + const Vec128 mask) { + return Vec128{_mm_andnot_ps(not_mask.raw, mask.raw)}; +} +template +HWY_API Vec128 AndNot(const Vec128 not_mask, + const Vec128 mask) { + return Vec128{_mm_andnot_pd(not_mask.raw, mask.raw)}; +} + +// ------------------------------ Or + +template +HWY_API Vec128 Or(Vec128 a, Vec128 b) { + return Vec128{_mm_or_si128(a.raw, b.raw)}; +} + +template +HWY_API Vec128 Or(const Vec128 a, + const Vec128 b) { + return Vec128{_mm_or_ps(a.raw, b.raw)}; +} +template +HWY_API Vec128 Or(const Vec128 a, + const Vec128 b) { + return Vec128{_mm_or_pd(a.raw, b.raw)}; +} + +// ------------------------------ Xor + +template +HWY_API Vec128 Xor(Vec128 a, Vec128 b) { + return Vec128{_mm_xor_si128(a.raw, b.raw)}; +} + +template +HWY_API Vec128 Xor(const Vec128 a, + const Vec128 b) { + return Vec128{_mm_xor_ps(a.raw, b.raw)}; +} +template +HWY_API Vec128 Xor(const Vec128 a, + const Vec128 b) { + return Vec128{_mm_xor_pd(a.raw, b.raw)}; +} + +// ------------------------------ Not + +template +HWY_API Vec128 Not(const Vec128 v) { + const DFromV d; + const RebindToUnsigned du; + using VU = VFromD; +#if HWY_TARGET <= HWY_AVX3 + const __m128i vu = BitCast(du, v).raw; + return BitCast(d, VU{_mm_ternarylogic_epi32(vu, vu, vu, 0x55)}); +#else + return Xor(v, BitCast(d, VU{_mm_set1_epi32(-1)})); +#endif +} + +// ------------------------------ Or3 + +template +HWY_API Vec128 Or3(Vec128 o1, Vec128 o2, Vec128 o3) { +#if HWY_TARGET <= HWY_AVX3 + const DFromV d; + const RebindToUnsigned du; + using VU = VFromD; + const __m128i ret = _mm_ternarylogic_epi64( + BitCast(du, o1).raw, BitCast(du, o2).raw, BitCast(du, o3).raw, 0xFE); + return BitCast(d, VU{ret}); +#else + return Or(o1, Or(o2, o3)); +#endif +} + +// ------------------------------ OrAnd + +template +HWY_API Vec128 OrAnd(Vec128 o, Vec128 a1, Vec128 a2) { +#if HWY_TARGET <= HWY_AVX3 + const DFromV d; + const RebindToUnsigned du; + using VU = VFromD; + const __m128i ret = 
_mm_ternarylogic_epi64( + BitCast(du, o).raw, BitCast(du, a1).raw, BitCast(du, a2).raw, 0xF8); + return BitCast(d, VU{ret}); +#else + return Or(o, And(a1, a2)); +#endif +} + +// ------------------------------ IfVecThenElse + +template +HWY_API Vec128 IfVecThenElse(Vec128 mask, Vec128 yes, + Vec128 no) { +#if HWY_TARGET <= HWY_AVX3 + const DFromV d; + const RebindToUnsigned du; + using VU = VFromD; + return BitCast( + d, VU{_mm_ternarylogic_epi64(BitCast(du, mask).raw, BitCast(du, yes).raw, + BitCast(du, no).raw, 0xCA)}); +#else + return IfThenElse(MaskFromVec(mask), yes, no); +#endif +} + +// ------------------------------ Operator overloads (internal-only if float) + +template +HWY_API Vec128 operator&(const Vec128 a, const Vec128 b) { + return And(a, b); +} + +template +HWY_API Vec128 operator|(const Vec128 a, const Vec128 b) { + return Or(a, b); +} + +template +HWY_API Vec128 operator^(const Vec128 a, const Vec128 b) { + return Xor(a, b); +} + +// ------------------------------ PopulationCount + +// 8/16 require BITALG, 32/64 require VPOPCNTDQ. +#if HWY_TARGET == HWY_AVX3_DL + +#ifdef HWY_NATIVE_POPCNT +#undef HWY_NATIVE_POPCNT +#else +#define HWY_NATIVE_POPCNT +#endif + +namespace detail { + +template +HWY_INLINE Vec128 PopulationCount(hwy::SizeTag<1> /* tag */, + Vec128 v) { + return Vec128{_mm_popcnt_epi8(v.raw)}; +} +template +HWY_INLINE Vec128 PopulationCount(hwy::SizeTag<2> /* tag */, + Vec128 v) { + return Vec128{_mm_popcnt_epi16(v.raw)}; +} +template +HWY_INLINE Vec128 PopulationCount(hwy::SizeTag<4> /* tag */, + Vec128 v) { + return Vec128{_mm_popcnt_epi32(v.raw)}; +} +template +HWY_INLINE Vec128 PopulationCount(hwy::SizeTag<8> /* tag */, + Vec128 v) { + return Vec128{_mm_popcnt_epi64(v.raw)}; +} + +} // namespace detail + +template +HWY_API Vec128 PopulationCount(Vec128 v) { + return detail::PopulationCount(hwy::SizeTag(), v); +} + +#endif // HWY_TARGET == HWY_AVX3_DL + +// ================================================== SIGN + +// ------------------------------ Neg + +// Tag dispatch instead of SFINAE for MSVC 2017 compatibility +namespace detail { + +template +HWY_INLINE Vec128 Neg(hwy::FloatTag /*tag*/, const Vec128 v) { + return Xor(v, SignBit(DFromV())); +} + +template +HWY_INLINE Vec128 Neg(hwy::NonFloatTag /*tag*/, const Vec128 v) { + return Zero(DFromV()) - v; +} + +} // namespace detail + +template +HWY_INLINE Vec128 Neg(const Vec128 v) { + return detail::Neg(hwy::IsFloatTag(), v); +} + +// ------------------------------ Abs + +// Returns absolute value, except that LimitsMin() maps to LimitsMax() + 1. +template +HWY_API Vec128 Abs(const Vec128 v) { +#if HWY_COMPILER_MSVC + // Workaround for incorrect codegen? (reaches breakpoint) + const auto zero = Zero(DFromV()); + return Vec128{_mm_max_epi8(v.raw, (zero - v).raw)}; +#else + return Vec128{_mm_abs_epi8(v.raw)}; +#endif +} +template +HWY_API Vec128 Abs(const Vec128 v) { + return Vec128{_mm_abs_epi16(v.raw)}; +} +template +HWY_API Vec128 Abs(const Vec128 v) { + return Vec128{_mm_abs_epi32(v.raw)}; +} +// i64 is implemented after BroadcastSignBit. 
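+// IEEE-754 floats are sign-magnitude, so |x| is x with the sign bit cleared,
+// which the f32/f64 overloads below implement via AND: e.g.
+// -1.5f = 0xBFC00000, and 0xBFC00000 & 0x7FFFFFFF = 0x3FC00000 = 1.5f.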
+template +HWY_API Vec128 Abs(const Vec128 v) { + const Vec128 mask{_mm_set1_epi32(0x7FFFFFFF)}; + return v & BitCast(DFromV(), mask); +} +template +HWY_API Vec128 Abs(const Vec128 v) { + const Vec128 mask{_mm_set1_epi64x(0x7FFFFFFFFFFFFFFFLL)}; + return v & BitCast(DFromV(), mask); +} + +// ------------------------------ CopySign + +template +HWY_API Vec128 CopySign(const Vec128 magn, + const Vec128 sign) { + static_assert(IsFloat(), "Only makes sense for floating-point"); + + const DFromV d; + const auto msb = SignBit(d); + +#if HWY_TARGET <= HWY_AVX3 + const RebindToUnsigned du; + // Truth table for msb, magn, sign | bitwise msb ? sign : mag + // 0 0 0 | 0 + // 0 0 1 | 0 + // 0 1 0 | 1 + // 0 1 1 | 1 + // 1 0 0 | 0 + // 1 0 1 | 1 + // 1 1 0 | 0 + // 1 1 1 | 1 + // The lane size does not matter because we are not using predication. + const __m128i out = _mm_ternarylogic_epi32( + BitCast(du, msb).raw, BitCast(du, magn).raw, BitCast(du, sign).raw, 0xAC); + return BitCast(d, VFromD{out}); +#else + return Or(AndNot(msb, magn), And(msb, sign)); +#endif +} + +template +HWY_API Vec128 CopySignToAbs(const Vec128 abs, + const Vec128 sign) { +#if HWY_TARGET <= HWY_AVX3 + // AVX3 can also handle abs < 0, so no extra action needed. + return CopySign(abs, sign); +#else + return Or(abs, And(SignBit(DFromV()), sign)); +#endif +} + +// ================================================== MASK + +#if HWY_TARGET <= HWY_AVX3 + +// ------------------------------ IfThenElse + +// Returns mask ? b : a. + +namespace detail { + +// Templates for signed/unsigned integer of a particular size. +template +HWY_INLINE Vec128 IfThenElse(hwy::SizeTag<1> /* tag */, + Mask128 mask, Vec128 yes, + Vec128 no) { + return Vec128{_mm_mask_mov_epi8(no.raw, mask.raw, yes.raw)}; +} +template +HWY_INLINE Vec128 IfThenElse(hwy::SizeTag<2> /* tag */, + Mask128 mask, Vec128 yes, + Vec128 no) { + return Vec128{_mm_mask_mov_epi16(no.raw, mask.raw, yes.raw)}; +} +template +HWY_INLINE Vec128 IfThenElse(hwy::SizeTag<4> /* tag */, + Mask128 mask, Vec128 yes, + Vec128 no) { + return Vec128{_mm_mask_mov_epi32(no.raw, mask.raw, yes.raw)}; +} +template +HWY_INLINE Vec128 IfThenElse(hwy::SizeTag<8> /* tag */, + Mask128 mask, Vec128 yes, + Vec128 no) { + return Vec128{_mm_mask_mov_epi64(no.raw, mask.raw, yes.raw)}; +} + +} // namespace detail + +template +HWY_API Vec128 IfThenElse(Mask128 mask, Vec128 yes, + Vec128 no) { + return detail::IfThenElse(hwy::SizeTag(), mask, yes, no); +} + +template +HWY_API Vec128 IfThenElse(Mask128 mask, + Vec128 yes, Vec128 no) { + return Vec128{_mm_mask_mov_ps(no.raw, mask.raw, yes.raw)}; +} + +template +HWY_API Vec128 IfThenElse(Mask128 mask, + Vec128 yes, + Vec128 no) { + return Vec128{_mm_mask_mov_pd(no.raw, mask.raw, yes.raw)}; +} + +namespace detail { + +template +HWY_INLINE Vec128 IfThenElseZero(hwy::SizeTag<1> /* tag */, + Mask128 mask, Vec128 yes) { + return Vec128{_mm_maskz_mov_epi8(mask.raw, yes.raw)}; +} +template +HWY_INLINE Vec128 IfThenElseZero(hwy::SizeTag<2> /* tag */, + Mask128 mask, Vec128 yes) { + return Vec128{_mm_maskz_mov_epi16(mask.raw, yes.raw)}; +} +template +HWY_INLINE Vec128 IfThenElseZero(hwy::SizeTag<4> /* tag */, + Mask128 mask, Vec128 yes) { + return Vec128{_mm_maskz_mov_epi32(mask.raw, yes.raw)}; +} +template +HWY_INLINE Vec128 IfThenElseZero(hwy::SizeTag<8> /* tag */, + Mask128 mask, Vec128 yes) { + return Vec128{_mm_maskz_mov_epi64(mask.raw, yes.raw)}; +} + +} // namespace detail + +template +HWY_API Vec128 IfThenElseZero(Mask128 mask, Vec128 yes) { + return 
detail::IfThenElseZero(hwy::SizeTag(), mask, yes); +} + +template +HWY_API Vec128 IfThenElseZero(Mask128 mask, + Vec128 yes) { + return Vec128{_mm_maskz_mov_ps(mask.raw, yes.raw)}; +} + +template +HWY_API Vec128 IfThenElseZero(Mask128 mask, + Vec128 yes) { + return Vec128{_mm_maskz_mov_pd(mask.raw, yes.raw)}; +} + +namespace detail { + +template +HWY_INLINE Vec128 IfThenZeroElse(hwy::SizeTag<1> /* tag */, + Mask128 mask, Vec128 no) { + // xor_epi8/16 are missing, but we have sub, which is just as fast for u8/16. + return Vec128{_mm_mask_sub_epi8(no.raw, mask.raw, no.raw, no.raw)}; +} +template +HWY_INLINE Vec128 IfThenZeroElse(hwy::SizeTag<2> /* tag */, + Mask128 mask, Vec128 no) { + return Vec128{_mm_mask_sub_epi16(no.raw, mask.raw, no.raw, no.raw)}; +} +template +HWY_INLINE Vec128 IfThenZeroElse(hwy::SizeTag<4> /* tag */, + Mask128 mask, Vec128 no) { + return Vec128{_mm_mask_xor_epi32(no.raw, mask.raw, no.raw, no.raw)}; +} +template +HWY_INLINE Vec128 IfThenZeroElse(hwy::SizeTag<8> /* tag */, + Mask128 mask, Vec128 no) { + return Vec128{_mm_mask_xor_epi64(no.raw, mask.raw, no.raw, no.raw)}; +} + +} // namespace detail + +template +HWY_API Vec128 IfThenZeroElse(Mask128 mask, Vec128 no) { + return detail::IfThenZeroElse(hwy::SizeTag(), mask, no); +} + +template +HWY_API Vec128 IfThenZeroElse(Mask128 mask, + Vec128 no) { + return Vec128{_mm_mask_xor_ps(no.raw, mask.raw, no.raw, no.raw)}; +} + +template +HWY_API Vec128 IfThenZeroElse(Mask128 mask, + Vec128 no) { + return Vec128{_mm_mask_xor_pd(no.raw, mask.raw, no.raw, no.raw)}; +} + +// ------------------------------ Mask logical + +// For Clang and GCC, mask intrinsics (KORTEST) weren't added until recently. +#if !defined(HWY_COMPILER_HAS_MASK_INTRINSICS) +#if HWY_COMPILER_MSVC != 0 || HWY_COMPILER_GCC_ACTUAL >= 700 || \ + HWY_COMPILER_CLANG >= 800 +#define HWY_COMPILER_HAS_MASK_INTRINSICS 1 +#else +#define HWY_COMPILER_HAS_MASK_INTRINSICS 0 +#endif +#endif // HWY_COMPILER_HAS_MASK_INTRINSICS + +namespace detail { + +template +HWY_INLINE Mask128 And(hwy::SizeTag<1> /*tag*/, const Mask128 a, + const Mask128 b) { +#if HWY_COMPILER_HAS_MASK_INTRINSICS + return Mask128{_kand_mask16(a.raw, b.raw)}; +#else + return Mask128{static_cast<__mmask16>(a.raw & b.raw)}; +#endif +} +template +HWY_INLINE Mask128 And(hwy::SizeTag<2> /*tag*/, const Mask128 a, + const Mask128 b) { +#if HWY_COMPILER_HAS_MASK_INTRINSICS + return Mask128{_kand_mask8(a.raw, b.raw)}; +#else + return Mask128{static_cast<__mmask8>(a.raw & b.raw)}; +#endif +} +template +HWY_INLINE Mask128 And(hwy::SizeTag<4> /*tag*/, const Mask128 a, + const Mask128 b) { +#if HWY_COMPILER_HAS_MASK_INTRINSICS + return Mask128{_kand_mask8(a.raw, b.raw)}; +#else + return Mask128{static_cast<__mmask8>(a.raw & b.raw)}; +#endif +} +template +HWY_INLINE Mask128 And(hwy::SizeTag<8> /*tag*/, const Mask128 a, + const Mask128 b) { +#if HWY_COMPILER_HAS_MASK_INTRINSICS + return Mask128{_kand_mask8(a.raw, b.raw)}; +#else + return Mask128{static_cast<__mmask8>(a.raw & b.raw)}; +#endif +} + +template +HWY_INLINE Mask128 AndNot(hwy::SizeTag<1> /*tag*/, const Mask128 a, + const Mask128 b) { +#if HWY_COMPILER_HAS_MASK_INTRINSICS + return Mask128{_kandn_mask16(a.raw, b.raw)}; +#else + return Mask128{static_cast<__mmask16>(~a.raw & b.raw)}; +#endif +} +template +HWY_INLINE Mask128 AndNot(hwy::SizeTag<2> /*tag*/, const Mask128 a, + const Mask128 b) { +#if HWY_COMPILER_HAS_MASK_INTRINSICS + return Mask128{_kandn_mask8(a.raw, b.raw)}; +#else + return Mask128{static_cast<__mmask8>(~a.raw & b.raw)}; +#endif +} +template 
+HWY_INLINE Mask128 AndNot(hwy::SizeTag<4> /*tag*/, const Mask128 a, + const Mask128 b) { +#if HWY_COMPILER_HAS_MASK_INTRINSICS + return Mask128{_kandn_mask8(a.raw, b.raw)}; +#else + return Mask128{static_cast<__mmask8>(~a.raw & b.raw)}; +#endif +} +template +HWY_INLINE Mask128 AndNot(hwy::SizeTag<8> /*tag*/, const Mask128 a, + const Mask128 b) { +#if HWY_COMPILER_HAS_MASK_INTRINSICS + return Mask128{_kandn_mask8(a.raw, b.raw)}; +#else + return Mask128{static_cast<__mmask8>(~a.raw & b.raw)}; +#endif +} + +template +HWY_INLINE Mask128 Or(hwy::SizeTag<1> /*tag*/, const Mask128 a, + const Mask128 b) { +#if HWY_COMPILER_HAS_MASK_INTRINSICS + return Mask128{_kor_mask16(a.raw, b.raw)}; +#else + return Mask128{static_cast<__mmask16>(a.raw | b.raw)}; +#endif +} +template +HWY_INLINE Mask128 Or(hwy::SizeTag<2> /*tag*/, const Mask128 a, + const Mask128 b) { +#if HWY_COMPILER_HAS_MASK_INTRINSICS + return Mask128{_kor_mask8(a.raw, b.raw)}; +#else + return Mask128{static_cast<__mmask8>(a.raw | b.raw)}; +#endif +} +template +HWY_INLINE Mask128 Or(hwy::SizeTag<4> /*tag*/, const Mask128 a, + const Mask128 b) { +#if HWY_COMPILER_HAS_MASK_INTRINSICS + return Mask128{_kor_mask8(a.raw, b.raw)}; +#else + return Mask128{static_cast<__mmask8>(a.raw | b.raw)}; +#endif +} +template +HWY_INLINE Mask128 Or(hwy::SizeTag<8> /*tag*/, const Mask128 a, + const Mask128 b) { +#if HWY_COMPILER_HAS_MASK_INTRINSICS + return Mask128{_kor_mask8(a.raw, b.raw)}; +#else + return Mask128{static_cast<__mmask8>(a.raw | b.raw)}; +#endif +} + +template +HWY_INLINE Mask128 Xor(hwy::SizeTag<1> /*tag*/, const Mask128 a, + const Mask128 b) { +#if HWY_COMPILER_HAS_MASK_INTRINSICS + return Mask128{_kxor_mask16(a.raw, b.raw)}; +#else + return Mask128{static_cast<__mmask16>(a.raw ^ b.raw)}; +#endif +} +template +HWY_INLINE Mask128 Xor(hwy::SizeTag<2> /*tag*/, const Mask128 a, + const Mask128 b) { +#if HWY_COMPILER_HAS_MASK_INTRINSICS + return Mask128{_kxor_mask8(a.raw, b.raw)}; +#else + return Mask128{static_cast<__mmask8>(a.raw ^ b.raw)}; +#endif +} +template +HWY_INLINE Mask128 Xor(hwy::SizeTag<4> /*tag*/, const Mask128 a, + const Mask128 b) { +#if HWY_COMPILER_HAS_MASK_INTRINSICS + return Mask128{_kxor_mask8(a.raw, b.raw)}; +#else + return Mask128{static_cast<__mmask8>(a.raw ^ b.raw)}; +#endif +} +template +HWY_INLINE Mask128 Xor(hwy::SizeTag<8> /*tag*/, const Mask128 a, + const Mask128 b) { +#if HWY_COMPILER_HAS_MASK_INTRINSICS + return Mask128{_kxor_mask8(a.raw, b.raw)}; +#else + return Mask128{static_cast<__mmask8>(a.raw ^ b.raw)}; +#endif +} + +template +HWY_INLINE Mask128 ExclusiveNeither(hwy::SizeTag<1> /*tag*/, + const Mask128 a, + const Mask128 b) { +#if HWY_COMPILER_HAS_MASK_INTRINSICS + return Mask128{_kxnor_mask16(a.raw, b.raw)}; +#else + return Mask128{static_cast<__mmask16>(~(a.raw ^ b.raw) & 0xFFFF)}; +#endif +} +template +HWY_INLINE Mask128 ExclusiveNeither(hwy::SizeTag<2> /*tag*/, + const Mask128 a, + const Mask128 b) { +#if HWY_COMPILER_HAS_MASK_INTRINSICS + return Mask128{_kxnor_mask8(a.raw, b.raw)}; +#else + return Mask128{static_cast<__mmask8>(~(a.raw ^ b.raw) & 0xFF)}; +#endif +} +template +HWY_INLINE Mask128 ExclusiveNeither(hwy::SizeTag<4> /*tag*/, + const Mask128 a, + const Mask128 b) { +#if HWY_COMPILER_HAS_MASK_INTRINSICS + return Mask128{static_cast<__mmask8>(_kxnor_mask8(a.raw, b.raw) & 0xF)}; +#else + return Mask128{static_cast<__mmask8>(~(a.raw ^ b.raw) & 0xF)}; +#endif +} +template +HWY_INLINE Mask128 ExclusiveNeither(hwy::SizeTag<8> /*tag*/, + const Mask128 a, + const Mask128 b) { +#if 
HWY_COMPILER_HAS_MASK_INTRINSICS + return Mask128{static_cast<__mmask8>(_kxnor_mask8(a.raw, b.raw) & 0x3)}; +#else + return Mask128{static_cast<__mmask8>(~(a.raw ^ b.raw) & 0x3)}; +#endif +} + +} // namespace detail + +template +HWY_API Mask128 And(const Mask128 a, Mask128 b) { + return detail::And(hwy::SizeTag(), a, b); +} + +template +HWY_API Mask128 AndNot(const Mask128 a, Mask128 b) { + return detail::AndNot(hwy::SizeTag(), a, b); +} + +template +HWY_API Mask128 Or(const Mask128 a, Mask128 b) { + return detail::Or(hwy::SizeTag(), a, b); +} + +template +HWY_API Mask128 Xor(const Mask128 a, Mask128 b) { + return detail::Xor(hwy::SizeTag(), a, b); +} + +template +HWY_API Mask128 Not(const Mask128 m) { + // Flip only the valid bits. + // TODO(janwas): use _knot intrinsics if N >= 8. + return Xor(m, Mask128::FromBits((1ull << N) - 1)); +} + +template +HWY_API Mask128 ExclusiveNeither(const Mask128 a, Mask128 b) { + return detail::ExclusiveNeither(hwy::SizeTag(), a, b); +} + +#else // AVX2 or below + +// ------------------------------ Mask + +// Mask and Vec are the same (true = FF..FF). +template +HWY_API Mask128 MaskFromVec(const Vec128 v) { + return Mask128{v.raw}; +} + +template +HWY_API Vec128 VecFromMask(const Mask128 v) { + return Vec128{v.raw}; +} + +template +HWY_API Vec128 VecFromMask(const Simd /* tag */, + const Mask128 v) { + return Vec128{v.raw}; +} + +#if HWY_TARGET == HWY_SSSE3 + +// mask ? yes : no +template +HWY_API Vec128 IfThenElse(Mask128 mask, Vec128 yes, + Vec128 no) { + const auto vmask = VecFromMask(DFromV(), mask); + return Or(And(vmask, yes), AndNot(vmask, no)); +} + +#else // HWY_TARGET == HWY_SSSE3 + +// mask ? yes : no +template +HWY_API Vec128 IfThenElse(Mask128 mask, Vec128 yes, + Vec128 no) { + return Vec128{_mm_blendv_epi8(no.raw, yes.raw, mask.raw)}; +} +template +HWY_API Vec128 IfThenElse(const Mask128 mask, + const Vec128 yes, + const Vec128 no) { + return Vec128{_mm_blendv_ps(no.raw, yes.raw, mask.raw)}; +} +template +HWY_API Vec128 IfThenElse(const Mask128 mask, + const Vec128 yes, + const Vec128 no) { + return Vec128{_mm_blendv_pd(no.raw, yes.raw, mask.raw)}; +} + +#endif // HWY_TARGET == HWY_SSSE3 + +// mask ? yes : 0 +template +HWY_API Vec128 IfThenElseZero(Mask128 mask, Vec128 yes) { + return yes & VecFromMask(DFromV(), mask); +} + +// mask ? 
0 : no +template +HWY_API Vec128 IfThenZeroElse(Mask128 mask, Vec128 no) { + return AndNot(VecFromMask(DFromV(), mask), no); +} + +// ------------------------------ Mask logical + +template +HWY_API Mask128 Not(const Mask128 m) { + return MaskFromVec(Not(VecFromMask(Simd(), m))); +} + +template +HWY_API Mask128 And(const Mask128 a, Mask128 b) { + const Simd d; + return MaskFromVec(And(VecFromMask(d, a), VecFromMask(d, b))); +} + +template +HWY_API Mask128 AndNot(const Mask128 a, Mask128 b) { + const Simd d; + return MaskFromVec(AndNot(VecFromMask(d, a), VecFromMask(d, b))); +} + +template +HWY_API Mask128 Or(const Mask128 a, Mask128 b) { + const Simd d; + return MaskFromVec(Or(VecFromMask(d, a), VecFromMask(d, b))); +} + +template +HWY_API Mask128 Xor(const Mask128 a, Mask128 b) { + const Simd d; + return MaskFromVec(Xor(VecFromMask(d, a), VecFromMask(d, b))); +} + +template +HWY_API Mask128 ExclusiveNeither(const Mask128 a, Mask128 b) { + const Simd d; + return MaskFromVec(AndNot(VecFromMask(d, a), Not(VecFromMask(d, b)))); +} + +#endif // HWY_TARGET <= HWY_AVX3 + +// ------------------------------ ShiftLeft + +template +HWY_API Vec128 ShiftLeft(const Vec128 v) { + return Vec128{_mm_slli_epi16(v.raw, kBits)}; +} + +template +HWY_API Vec128 ShiftLeft(const Vec128 v) { + return Vec128{_mm_slli_epi32(v.raw, kBits)}; +} + +template +HWY_API Vec128 ShiftLeft(const Vec128 v) { + return Vec128{_mm_slli_epi64(v.raw, kBits)}; +} + +template +HWY_API Vec128 ShiftLeft(const Vec128 v) { + return Vec128{_mm_slli_epi16(v.raw, kBits)}; +} +template +HWY_API Vec128 ShiftLeft(const Vec128 v) { + return Vec128{_mm_slli_epi32(v.raw, kBits)}; +} +template +HWY_API Vec128 ShiftLeft(const Vec128 v) { + return Vec128{_mm_slli_epi64(v.raw, kBits)}; +} + +template +HWY_API Vec128 ShiftLeft(const Vec128 v) { + const DFromV d8; + // Use raw instead of BitCast to support N=1. + const Vec128 shifted{ShiftLeft(Vec128>{v.raw}).raw}; + return kBits == 1 + ? (v + v) + : (shifted & Set(d8, static_cast((0xFF << kBits) & 0xFF))); +} + +// ------------------------------ ShiftRight + +template +HWY_API Vec128 ShiftRight(const Vec128 v) { + return Vec128{_mm_srli_epi16(v.raw, kBits)}; +} +template +HWY_API Vec128 ShiftRight(const Vec128 v) { + return Vec128{_mm_srli_epi32(v.raw, kBits)}; +} +template +HWY_API Vec128 ShiftRight(const Vec128 v) { + return Vec128{_mm_srli_epi64(v.raw, kBits)}; +} + +template +HWY_API Vec128 ShiftRight(const Vec128 v) { + const DFromV d8; + // Use raw instead of BitCast to support N=1. + const Vec128 shifted{ + ShiftRight(Vec128{v.raw}).raw}; + return shifted & Set(d8, 0xFF >> kBits); +} + +template +HWY_API Vec128 ShiftRight(const Vec128 v) { + return Vec128{_mm_srai_epi16(v.raw, kBits)}; +} +template +HWY_API Vec128 ShiftRight(const Vec128 v) { + return Vec128{_mm_srai_epi32(v.raw, kBits)}; +} + +template +HWY_API Vec128 ShiftRight(const Vec128 v) { + const DFromV di; + const RebindToUnsigned du; + const auto shifted = BitCast(di, ShiftRight(BitCast(du, v))); + const auto shifted_sign = BitCast(di, Set(du, 0x80 >> kBits)); + return (shifted ^ shifted_sign) - shifted_sign; +} + +// i64 is implemented after BroadcastSignBit. 
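+// The (shifted ^ shifted_sign) - shifted_sign idiom above sign-extends the
+// logically shifted value; shifted_sign = 0x80 >> kBits marks where the sign
+// bit lands. Worked example for kBits = 2, v = 0x80 (-128): the u8 shift
+// yields 0x20, and (0x20 ^ 0x20) - 0x20 = 0xE0 = -32, matching -128 >> 2.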
+ +// ================================================== SWIZZLE (1) + +// ------------------------------ TableLookupBytes +template +HWY_API Vec128 TableLookupBytes(const Vec128 bytes, + const Vec128 from) { + return Vec128{_mm_shuffle_epi8(bytes.raw, from.raw)}; +} + +// ------------------------------ TableLookupBytesOr0 +// For all vector widths; x86 anyway zeroes if >= 0x80. +template +HWY_API VI TableLookupBytesOr0(const V bytes, const VI from) { + return TableLookupBytes(bytes, from); +} + +// ------------------------------ Shuffles (ShiftRight, TableLookupBytes) + +// Notation: let Vec128 have lanes 3,2,1,0 (0 is least-significant). +// Shuffle0321 rotates one lane to the right (the previous least-significant +// lane is now most-significant). These could also be implemented via +// CombineShiftRightBytes but the shuffle_abcd notation is more convenient. + +// Swap 32-bit halves in 64-bit halves. +template +HWY_API Vec128 Shuffle2301(const Vec128 v) { + static_assert(sizeof(T) == 4, "Only for 32-bit lanes"); + static_assert(N == 2 || N == 4, "Does not make sense for N=1"); + return Vec128{_mm_shuffle_epi32(v.raw, 0xB1)}; +} +template +HWY_API Vec128 Shuffle2301(const Vec128 v) { + static_assert(N == 2 || N == 4, "Does not make sense for N=1"); + return Vec128{_mm_shuffle_ps(v.raw, v.raw, 0xB1)}; +} + +// These are used by generic_ops-inl to implement LoadInterleaved3. As with +// Intel's shuffle* intrinsics and InterleaveLower, the lower half of the output +// comes from the first argument. +namespace detail { + +template +HWY_API Vec128 Shuffle2301(const Vec128 a, const Vec128 b) { + const Twice> d2; + const auto ba = Combine(d2, b, a); + alignas(16) const T kShuffle[8] = {1, 0, 7, 6}; + return Vec128{TableLookupBytes(ba, Load(d2, kShuffle)).raw}; +} +template +HWY_API Vec128 Shuffle2301(const Vec128 a, const Vec128 b) { + const Twice> d2; + const auto ba = Combine(d2, b, a); + alignas(16) const T kShuffle[8] = {0x0302, 0x0100, 0x0f0e, 0x0d0c}; + return Vec128{TableLookupBytes(ba, Load(d2, kShuffle)).raw}; +} +template +HWY_API Vec128 Shuffle2301(const Vec128 a, const Vec128 b) { + const DFromV d; + const RebindToFloat df; + constexpr int m = _MM_SHUFFLE(2, 3, 0, 1); + return BitCast(d, Vec128{_mm_shuffle_ps(BitCast(df, a).raw, + BitCast(df, b).raw, m)}); +} + +template +HWY_API Vec128 Shuffle1230(const Vec128 a, const Vec128 b) { + const Twice> d2; + const auto ba = Combine(d2, b, a); + alignas(16) const T kShuffle[8] = {0, 3, 6, 5}; + return Vec128{TableLookupBytes(ba, Load(d2, kShuffle)).raw}; +} +template +HWY_API Vec128 Shuffle1230(const Vec128 a, const Vec128 b) { + const Twice> d2; + const auto ba = Combine(d2, b, a); + alignas(16) const T kShuffle[8] = {0x0100, 0x0706, 0x0d0c, 0x0b0a}; + return Vec128{TableLookupBytes(ba, Load(d2, kShuffle)).raw}; +} +template +HWY_API Vec128 Shuffle1230(const Vec128 a, const Vec128 b) { + const DFromV d; + const RebindToFloat df; + constexpr int m = _MM_SHUFFLE(1, 2, 3, 0); + return BitCast(d, Vec128{_mm_shuffle_ps(BitCast(df, a).raw, + BitCast(df, b).raw, m)}); +} + +template +HWY_API Vec128 Shuffle3012(const Vec128 a, const Vec128 b) { + const Twice> d2; + const auto ba = Combine(d2, b, a); + alignas(16) const T kShuffle[8] = {2, 1, 4, 7}; + return Vec128{TableLookupBytes(ba, Load(d2, kShuffle)).raw}; +} +template +HWY_API Vec128 Shuffle3012(const Vec128 a, const Vec128 b) { + const Twice> d2; + const auto ba = Combine(d2, b, a); + alignas(16) const T kShuffle[8] = {0x0504, 0x0302, 0x0908, 0x0f0e}; + return Vec128{TableLookupBytes(ba, 
Load(d2, kShuffle)).raw}; +} +template +HWY_API Vec128 Shuffle3012(const Vec128 a, const Vec128 b) { + const DFromV d; + const RebindToFloat df; + constexpr int m = _MM_SHUFFLE(3, 0, 1, 2); + return BitCast(d, Vec128{_mm_shuffle_ps(BitCast(df, a).raw, + BitCast(df, b).raw, m)}); +} + +} // namespace detail + +// Swap 64-bit halves +HWY_API Vec128 Shuffle1032(const Vec128 v) { + return Vec128{_mm_shuffle_epi32(v.raw, 0x4E)}; +} +HWY_API Vec128 Shuffle1032(const Vec128 v) { + return Vec128{_mm_shuffle_epi32(v.raw, 0x4E)}; +} +HWY_API Vec128 Shuffle1032(const Vec128 v) { + return Vec128{_mm_shuffle_ps(v.raw, v.raw, 0x4E)}; +} +HWY_API Vec128 Shuffle01(const Vec128 v) { + return Vec128{_mm_shuffle_epi32(v.raw, 0x4E)}; +} +HWY_API Vec128 Shuffle01(const Vec128 v) { + return Vec128{_mm_shuffle_epi32(v.raw, 0x4E)}; +} +HWY_API Vec128 Shuffle01(const Vec128 v) { + return Vec128{_mm_shuffle_pd(v.raw, v.raw, 1)}; +} + +// Rotate right 32 bits +HWY_API Vec128 Shuffle0321(const Vec128 v) { + return Vec128{_mm_shuffle_epi32(v.raw, 0x39)}; +} +HWY_API Vec128 Shuffle0321(const Vec128 v) { + return Vec128{_mm_shuffle_epi32(v.raw, 0x39)}; +} +HWY_API Vec128 Shuffle0321(const Vec128 v) { + return Vec128{_mm_shuffle_ps(v.raw, v.raw, 0x39)}; +} +// Rotate left 32 bits +HWY_API Vec128 Shuffle2103(const Vec128 v) { + return Vec128{_mm_shuffle_epi32(v.raw, 0x93)}; +} +HWY_API Vec128 Shuffle2103(const Vec128 v) { + return Vec128{_mm_shuffle_epi32(v.raw, 0x93)}; +} +HWY_API Vec128 Shuffle2103(const Vec128 v) { + return Vec128{_mm_shuffle_ps(v.raw, v.raw, 0x93)}; +} + +// Reverse +HWY_API Vec128 Shuffle0123(const Vec128 v) { + return Vec128{_mm_shuffle_epi32(v.raw, 0x1B)}; +} +HWY_API Vec128 Shuffle0123(const Vec128 v) { + return Vec128{_mm_shuffle_epi32(v.raw, 0x1B)}; +} +HWY_API Vec128 Shuffle0123(const Vec128 v) { + return Vec128{_mm_shuffle_ps(v.raw, v.raw, 0x1B)}; +} + +// ================================================== COMPARE + +#if HWY_TARGET <= HWY_AVX3 + +// Comparisons set a mask bit to 1 if the condition is true, else 0. 
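+// That is, a comparison yields a compact predicate register (__mmask8/16, one
+// bit per lane) rather than a full vector. A minimal sketch with hypothetical
+// values (illustrative only, not part of this header):
+#if 0
+const Full128<int32_t> d;                        // 4 lanes
+const Mask128<int32_t> m = Set(d, 3) > Zero(d);  // m.raw == 0b1111
+#endif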
+ +template +HWY_API Mask128 RebindMask(Simd /*tag*/, + Mask128 m) { + static_assert(sizeof(TFrom) == sizeof(TTo), "Must have same size"); + return Mask128{m.raw}; +} + +namespace detail { + +template +HWY_INLINE Mask128 TestBit(hwy::SizeTag<1> /*tag*/, const Vec128 v, + const Vec128 bit) { + return Mask128{_mm_test_epi8_mask(v.raw, bit.raw)}; +} +template +HWY_INLINE Mask128 TestBit(hwy::SizeTag<2> /*tag*/, const Vec128 v, + const Vec128 bit) { + return Mask128{_mm_test_epi16_mask(v.raw, bit.raw)}; +} +template +HWY_INLINE Mask128 TestBit(hwy::SizeTag<4> /*tag*/, const Vec128 v, + const Vec128 bit) { + return Mask128{_mm_test_epi32_mask(v.raw, bit.raw)}; +} +template +HWY_INLINE Mask128 TestBit(hwy::SizeTag<8> /*tag*/, const Vec128 v, + const Vec128 bit) { + return Mask128{_mm_test_epi64_mask(v.raw, bit.raw)}; +} + +} // namespace detail + +template +HWY_API Mask128 TestBit(const Vec128 v, const Vec128 bit) { + static_assert(!hwy::IsFloat(), "Only integer vectors supported"); + return detail::TestBit(hwy::SizeTag(), v, bit); +} + +// ------------------------------ Equality + +template +HWY_API Mask128 operator==(const Vec128 a, const Vec128 b) { + return Mask128{_mm_cmpeq_epi8_mask(a.raw, b.raw)}; +} + +template +HWY_API Mask128 operator==(const Vec128 a, const Vec128 b) { + return Mask128{_mm_cmpeq_epi16_mask(a.raw, b.raw)}; +} + +template +HWY_API Mask128 operator==(const Vec128 a, const Vec128 b) { + return Mask128{_mm_cmpeq_epi32_mask(a.raw, b.raw)}; +} + +template +HWY_API Mask128 operator==(const Vec128 a, const Vec128 b) { + return Mask128{_mm_cmpeq_epi64_mask(a.raw, b.raw)}; +} + +template +HWY_API Mask128 operator==(Vec128 a, Vec128 b) { + return Mask128{_mm_cmp_ps_mask(a.raw, b.raw, _CMP_EQ_OQ)}; +} + +template +HWY_API Mask128 operator==(Vec128 a, + Vec128 b) { + return Mask128{_mm_cmp_pd_mask(a.raw, b.raw, _CMP_EQ_OQ)}; +} + +// ------------------------------ Inequality + +template +HWY_API Mask128 operator!=(const Vec128 a, const Vec128 b) { + return Mask128{_mm_cmpneq_epi8_mask(a.raw, b.raw)}; +} + +template +HWY_API Mask128 operator!=(const Vec128 a, const Vec128 b) { + return Mask128{_mm_cmpneq_epi16_mask(a.raw, b.raw)}; +} + +template +HWY_API Mask128 operator!=(const Vec128 a, const Vec128 b) { + return Mask128{_mm_cmpneq_epi32_mask(a.raw, b.raw)}; +} + +template +HWY_API Mask128 operator!=(const Vec128 a, const Vec128 b) { + return Mask128{_mm_cmpneq_epi64_mask(a.raw, b.raw)}; +} + +template +HWY_API Mask128 operator!=(Vec128 a, Vec128 b) { + return Mask128{_mm_cmp_ps_mask(a.raw, b.raw, _CMP_NEQ_OQ)}; +} + +template +HWY_API Mask128 operator!=(Vec128 a, + Vec128 b) { + return Mask128{_mm_cmp_pd_mask(a.raw, b.raw, _CMP_NEQ_OQ)}; +} + +// ------------------------------ Strict inequality + +// Signed/float < +template +HWY_API Mask128 operator>(Vec128 a, Vec128 b) { + return Mask128{_mm_cmpgt_epi8_mask(a.raw, b.raw)}; +} +template +HWY_API Mask128 operator>(Vec128 a, + Vec128 b) { + return Mask128{_mm_cmpgt_epi16_mask(a.raw, b.raw)}; +} +template +HWY_API Mask128 operator>(Vec128 a, + Vec128 b) { + return Mask128{_mm_cmpgt_epi32_mask(a.raw, b.raw)}; +} +template +HWY_API Mask128 operator>(Vec128 a, + Vec128 b) { + return Mask128{_mm_cmpgt_epi64_mask(a.raw, b.raw)}; +} + +template +HWY_API Mask128 operator>(Vec128 a, + Vec128 b) { + return Mask128{_mm_cmpgt_epu8_mask(a.raw, b.raw)}; +} +template +HWY_API Mask128 operator>(Vec128 a, + Vec128 b) { + return Mask128{_mm_cmpgt_epu16_mask(a.raw, b.raw)}; +} +template +HWY_API Mask128 operator>(Vec128 a, + Vec128 b) { + return 
Mask128{_mm_cmpgt_epu32_mask(a.raw, b.raw)}; +} +template +HWY_API Mask128 operator>(Vec128 a, + Vec128 b) { + return Mask128{_mm_cmpgt_epu64_mask(a.raw, b.raw)}; +} + +template +HWY_API Mask128 operator>(Vec128 a, Vec128 b) { + return Mask128{_mm_cmp_ps_mask(a.raw, b.raw, _CMP_GT_OQ)}; +} +template +HWY_API Mask128 operator>(Vec128 a, Vec128 b) { + return Mask128{_mm_cmp_pd_mask(a.raw, b.raw, _CMP_GT_OQ)}; +} + +// ------------------------------ Weak inequality + +template +HWY_API Mask128 operator>=(Vec128 a, Vec128 b) { + return Mask128{_mm_cmp_ps_mask(a.raw, b.raw, _CMP_GE_OQ)}; +} +template +HWY_API Mask128 operator>=(Vec128 a, + Vec128 b) { + return Mask128{_mm_cmp_pd_mask(a.raw, b.raw, _CMP_GE_OQ)}; +} + +// ------------------------------ Mask + +namespace detail { + +template +HWY_INLINE Mask128 MaskFromVec(hwy::SizeTag<1> /*tag*/, + const Vec128 v) { + return Mask128{_mm_movepi8_mask(v.raw)}; +} +template +HWY_INLINE Mask128 MaskFromVec(hwy::SizeTag<2> /*tag*/, + const Vec128 v) { + return Mask128{_mm_movepi16_mask(v.raw)}; +} +template +HWY_INLINE Mask128 MaskFromVec(hwy::SizeTag<4> /*tag*/, + const Vec128 v) { + return Mask128{_mm_movepi32_mask(v.raw)}; +} +template +HWY_INLINE Mask128 MaskFromVec(hwy::SizeTag<8> /*tag*/, + const Vec128 v) { + return Mask128{_mm_movepi64_mask(v.raw)}; +} + +} // namespace detail + +template +HWY_API Mask128 MaskFromVec(const Vec128 v) { + return detail::MaskFromVec(hwy::SizeTag(), v); +} +// There do not seem to be native floating-point versions of these instructions. +template +HWY_API Mask128 MaskFromVec(const Vec128 v) { + const RebindToSigned> di; + return Mask128{MaskFromVec(BitCast(di, v)).raw}; +} +template +HWY_API Mask128 MaskFromVec(const Vec128 v) { + const RebindToSigned> di; + return Mask128{MaskFromVec(BitCast(di, v)).raw}; +} + +template +HWY_API Vec128 VecFromMask(const Mask128 v) { + return Vec128{_mm_movm_epi8(v.raw)}; +} + +template +HWY_API Vec128 VecFromMask(const Mask128 v) { + return Vec128{_mm_movm_epi16(v.raw)}; +} + +template +HWY_API Vec128 VecFromMask(const Mask128 v) { + return Vec128{_mm_movm_epi32(v.raw)}; +} + +template +HWY_API Vec128 VecFromMask(const Mask128 v) { + return Vec128{_mm_movm_epi64(v.raw)}; +} + +template +HWY_API Vec128 VecFromMask(const Mask128 v) { + return Vec128{_mm_castsi128_ps(_mm_movm_epi32(v.raw))}; +} + +template +HWY_API Vec128 VecFromMask(const Mask128 v) { + return Vec128{_mm_castsi128_pd(_mm_movm_epi64(v.raw))}; +} + +template +HWY_API Vec128 VecFromMask(Simd /* tag */, + const Mask128 v) { + return VecFromMask(v); +} + +#else // AVX2 or below + +// Comparisons fill a lane with 1-bits if the condition is true, else 0. 
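+// Editor's illustrative sketch (not part of the imported source; the helper
+// name is hypothetical): on these targets a mask is a full-width vector whose
+// true lanes are all-ones, so mask<->vector conversion is free.
+HWY_MAYBE_UNUSED static Vec128<int32_t> AllTrueLanesExample() {
+  const Full128<int32_t> d;
+  const Mask128<int32_t> m = MaskFromVec(Set(d, -1));  // all lanes true
+  return VecFromMask(d, m);  // every lane is 0xFFFFFFFF
+}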
+ +template +HWY_API Mask128 RebindMask(Simd /*tag*/, + Mask128 m) { + static_assert(sizeof(TFrom) == sizeof(TTo), "Must have same size"); + const Simd d; + return MaskFromVec(BitCast(Simd(), VecFromMask(d, m))); +} + +template +HWY_API Mask128 TestBit(Vec128 v, Vec128 bit) { + static_assert(!hwy::IsFloat(), "Only integer vectors supported"); + return (v & bit) == bit; +} + +// ------------------------------ Equality + +// Unsigned +template +HWY_API Mask128 operator==(const Vec128 a, + const Vec128 b) { + return Mask128{_mm_cmpeq_epi8(a.raw, b.raw)}; +} +template +HWY_API Mask128 operator==(const Vec128 a, + const Vec128 b) { + return Mask128{_mm_cmpeq_epi16(a.raw, b.raw)}; +} +template +HWY_API Mask128 operator==(const Vec128 a, + const Vec128 b) { + return Mask128{_mm_cmpeq_epi32(a.raw, b.raw)}; +} +template +HWY_API Mask128 operator==(const Vec128 a, + const Vec128 b) { +#if HWY_TARGET == HWY_SSSE3 + const Simd d32; + const Simd d64; + const auto cmp32 = VecFromMask(d32, Eq(BitCast(d32, a), BitCast(d32, b))); + const auto cmp64 = cmp32 & Shuffle2301(cmp32); + return MaskFromVec(BitCast(d64, cmp64)); +#else + return Mask128{_mm_cmpeq_epi64(a.raw, b.raw)}; +#endif +} + +// Signed +template +HWY_API Mask128 operator==(const Vec128 a, + const Vec128 b) { + return Mask128{_mm_cmpeq_epi8(a.raw, b.raw)}; +} +template +HWY_API Mask128 operator==(Vec128 a, + Vec128 b) { + return Mask128{_mm_cmpeq_epi16(a.raw, b.raw)}; +} +template +HWY_API Mask128 operator==(const Vec128 a, + const Vec128 b) { + return Mask128{_mm_cmpeq_epi32(a.raw, b.raw)}; +} +template +HWY_API Mask128 operator==(const Vec128 a, + const Vec128 b) { + // Same as signed ==; avoid duplicating the SSSE3 version. + const DFromV d; + RebindToUnsigned du; + return RebindMask(d, BitCast(du, a) == BitCast(du, b)); +} + +// Float +template +HWY_API Mask128 operator==(const Vec128 a, + const Vec128 b) { + return Mask128{_mm_cmpeq_ps(a.raw, b.raw)}; +} +template +HWY_API Mask128 operator==(const Vec128 a, + const Vec128 b) { + return Mask128{_mm_cmpeq_pd(a.raw, b.raw)}; +} + +// ------------------------------ Inequality + +// This cannot have T as a template argument, otherwise it is not more +// specialized than rewritten operator== in C++20, leading to compile +// errors: https://gcc.godbolt.org/z/xsrPhPvPT. 
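+// Editor's note (illustrative, not from the imported source): C++20 may
+// rewrite a != b as !(b == a). A fully generic
+//   template <typename T, size_t N> Mask128<T, N> operator!=(...);
+// would be no more specialized than that rewritten candidate, so overload
+// resolution becomes ambiguous; fixing the element type below avoids this.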
+template +HWY_API Mask128 operator!=(Vec128 a, + Vec128 b) { + return Not(a == b); +} +template +HWY_API Mask128 operator!=(Vec128 a, + Vec128 b) { + return Not(a == b); +} +template +HWY_API Mask128 operator!=(Vec128 a, + Vec128 b) { + return Not(a == b); +} +template +HWY_API Mask128 operator!=(Vec128 a, + Vec128 b) { + return Not(a == b); +} +template +HWY_API Mask128 operator!=(Vec128 a, + Vec128 b) { + return Not(a == b); +} +template +HWY_API Mask128 operator!=(Vec128 a, + Vec128 b) { + return Not(a == b); +} +template +HWY_API Mask128 operator!=(Vec128 a, + Vec128 b) { + return Not(a == b); +} +template +HWY_API Mask128 operator!=(Vec128 a, + Vec128 b) { + return Not(a == b); +} + +template +HWY_API Mask128 operator!=(const Vec128 a, + const Vec128 b) { + return Mask128{_mm_cmpneq_ps(a.raw, b.raw)}; +} +template +HWY_API Mask128 operator!=(const Vec128 a, + const Vec128 b) { + return Mask128{_mm_cmpneq_pd(a.raw, b.raw)}; +} + +// ------------------------------ Strict inequality + +namespace detail { + +template +HWY_INLINE Mask128 Gt(hwy::SignedTag /*tag*/, Vec128 a, + Vec128 b) { + return Mask128{_mm_cmpgt_epi8(a.raw, b.raw)}; +} +template +HWY_INLINE Mask128 Gt(hwy::SignedTag /*tag*/, Vec128 a, + Vec128 b) { + return Mask128{_mm_cmpgt_epi16(a.raw, b.raw)}; +} +template +HWY_INLINE Mask128 Gt(hwy::SignedTag /*tag*/, Vec128 a, + Vec128 b) { + return Mask128{_mm_cmpgt_epi32(a.raw, b.raw)}; +} + +template +HWY_INLINE Mask128 Gt(hwy::SignedTag /*tag*/, + const Vec128 a, + const Vec128 b) { +#if HWY_TARGET == HWY_SSSE3 + // See https://stackoverflow.com/questions/65166174/: + const Simd d; + const RepartitionToNarrow d32; + const Vec128 m_eq32{Eq(BitCast(d32, a), BitCast(d32, b)).raw}; + const Vec128 m_gt32{Gt(BitCast(d32, a), BitCast(d32, b)).raw}; + // If a.upper is greater, upper := true. Otherwise, if a.upper == b.upper: + // upper := b-a (unsigned comparison result of lower). Otherwise: upper := 0. + const __m128i upper = OrAnd(m_gt32, m_eq32, Sub(b, a)).raw; + // Duplicate upper to lower half. 
+ return Mask128{_mm_shuffle_epi32(upper, _MM_SHUFFLE(3, 3, 1, 1))}; +#else + return Mask128{_mm_cmpgt_epi64(a.raw, b.raw)}; // SSE4.2 +#endif +} + +template +HWY_INLINE Mask128 Gt(hwy::UnsignedTag /*tag*/, Vec128 a, + Vec128 b) { + const DFromV du; + const RebindToSigned di; + const Vec128 msb = Set(du, (LimitsMax() >> 1) + 1); + const auto sa = BitCast(di, Xor(a, msb)); + const auto sb = BitCast(di, Xor(b, msb)); + return RebindMask(du, Gt(hwy::SignedTag(), sa, sb)); +} + +template +HWY_INLINE Mask128 Gt(hwy::FloatTag /*tag*/, Vec128 a, + Vec128 b) { + return Mask128{_mm_cmpgt_ps(a.raw, b.raw)}; +} +template +HWY_INLINE Mask128 Gt(hwy::FloatTag /*tag*/, Vec128 a, + Vec128 b) { + return Mask128{_mm_cmpgt_pd(a.raw, b.raw)}; +} + +} // namespace detail + +template +HWY_INLINE Mask128 operator>(Vec128 a, Vec128 b) { + return detail::Gt(hwy::TypeTag(), a, b); +} + +// ------------------------------ Weak inequality + +template +HWY_API Mask128 operator>=(const Vec128 a, + const Vec128 b) { + return Mask128{_mm_cmpge_ps(a.raw, b.raw)}; +} +template +HWY_API Mask128 operator>=(const Vec128 a, + const Vec128 b) { + return Mask128{_mm_cmpge_pd(a.raw, b.raw)}; +} + +#endif // HWY_TARGET <= HWY_AVX3 + +// ------------------------------ Reversed comparisons + +template +HWY_API Mask128 operator<(Vec128 a, Vec128 b) { + return b > a; +} + +template +HWY_API Mask128 operator<=(Vec128 a, Vec128 b) { + return b >= a; +} + +// ------------------------------ FirstN (Iota, Lt) + +template +HWY_API Mask128 FirstN(const Simd d, size_t num) { +#if HWY_TARGET <= HWY_AVX3 + (void)d; + const uint64_t all = (1ull << N) - 1; + // BZHI only looks at the lower 8 bits of num! + const uint64_t bits = (num > 255) ? all : _bzhi_u64(all, num); + return Mask128::FromBits(bits); +#else + const RebindToSigned di; // Signed comparisons are cheaper. + return RebindMask(d, Iota(di, 0) < Set(di, static_cast>(num))); +#endif +} + +template +using MFromD = decltype(FirstN(D(), 0)); + +// ================================================== MEMORY (1) + +// Clang static analysis claims the memory immediately after a partial vector +// store is uninitialized, and also flags the input to partial loads (at least +// for loadl_pd) as "garbage". This is a false alarm because msan does not +// raise errors. We work around this by using CopyBytes instead of intrinsics, +// but only for the analyzer to avoid potentially bad code generation. +// Unfortunately __clang_analyzer__ was not defined for clang-tidy prior to v7. 
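+// Editor's note (illustrative): with the macro below set to 1, a partial
+// load of e.g. 8 bytes becomes
+//   __m128i v = _mm_setzero_si128();
+//   CopyBytes<8>(p, &v);  // plain memcpy the analyzer can reason about
+// instead of a single _mm_loadl_epi64.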
+#ifndef HWY_SAFE_PARTIAL_LOAD_STORE +#if defined(__clang_analyzer__) || \ + (HWY_COMPILER_CLANG != 0 && HWY_COMPILER_CLANG < 700) +#define HWY_SAFE_PARTIAL_LOAD_STORE 1 +#else +#define HWY_SAFE_PARTIAL_LOAD_STORE 0 +#endif +#endif // HWY_SAFE_PARTIAL_LOAD_STORE + +// ------------------------------ Load + +template +HWY_API Vec128 Load(Full128 /* tag */, const T* HWY_RESTRICT aligned) { + return Vec128{_mm_load_si128(reinterpret_cast(aligned))}; +} +HWY_API Vec128 Load(Full128 /* tag */, + const float* HWY_RESTRICT aligned) { + return Vec128{_mm_load_ps(aligned)}; +} +HWY_API Vec128 Load(Full128 /* tag */, + const double* HWY_RESTRICT aligned) { + return Vec128{_mm_load_pd(aligned)}; +} + +template +HWY_API Vec128 LoadU(Full128 /* tag */, const T* HWY_RESTRICT p) { + return Vec128{_mm_loadu_si128(reinterpret_cast(p))}; +} +HWY_API Vec128 LoadU(Full128 /* tag */, + const float* HWY_RESTRICT p) { + return Vec128{_mm_loadu_ps(p)}; +} +HWY_API Vec128 LoadU(Full128 /* tag */, + const double* HWY_RESTRICT p) { + return Vec128{_mm_loadu_pd(p)}; +} + +template +HWY_API Vec64 Load(Full64 /* tag */, const T* HWY_RESTRICT p) { +#if HWY_SAFE_PARTIAL_LOAD_STORE + __m128i v = _mm_setzero_si128(); + CopyBytes<8>(p, &v); // not same size + return Vec64{v}; +#else + return Vec64{_mm_loadl_epi64(reinterpret_cast(p))}; +#endif +} + +HWY_API Vec128 Load(Full64 /* tag */, + const float* HWY_RESTRICT p) { +#if HWY_SAFE_PARTIAL_LOAD_STORE + __m128 v = _mm_setzero_ps(); + CopyBytes<8>(p, &v); // not same size + return Vec128{v}; +#else + const __m128 hi = _mm_setzero_ps(); + return Vec128{_mm_loadl_pi(hi, reinterpret_cast(p))}; +#endif +} + +HWY_API Vec64 Load(Full64 /* tag */, + const double* HWY_RESTRICT p) { +#if HWY_SAFE_PARTIAL_LOAD_STORE + __m128d v = _mm_setzero_pd(); + CopyBytes<8>(p, &v); // not same size + return Vec64{v}; +#else + return Vec64{_mm_load_sd(p)}; +#endif +} + +HWY_API Vec128 Load(Full32 /* tag */, + const float* HWY_RESTRICT p) { +#if HWY_SAFE_PARTIAL_LOAD_STORE + __m128 v = _mm_setzero_ps(); + CopyBytes<4>(p, &v); // not same size + return Vec128{v}; +#else + return Vec128{_mm_load_ss(p)}; +#endif +} + +// Any <= 32 bit except +template +HWY_API Vec128 Load(Simd /* tag */, const T* HWY_RESTRICT p) { + constexpr size_t kSize = sizeof(T) * N; +#if HWY_SAFE_PARTIAL_LOAD_STORE + __m128 v = _mm_setzero_ps(); + CopyBytes(p, &v); // not same size + return Vec128{v}; +#else + int32_t bits = 0; + CopyBytes(p, &bits); // not same size + return Vec128{_mm_cvtsi32_si128(bits)}; +#endif +} + +// For < 128 bit, LoadU == Load. +template +HWY_API Vec128 LoadU(Simd d, const T* HWY_RESTRICT p) { + return Load(d, p); +} + +// 128-bit SIMD => nothing to duplicate, same as an unaligned load. +template +HWY_API Vec128 LoadDup128(Simd d, const T* HWY_RESTRICT p) { + return LoadU(d, p); +} + +// Returns a vector with lane i=[0, N) set to "first" + i. 
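+// Editor's note (illustrative): e.g. for a 4-lane int32 tag d, Iota(d, 1)
+// yields lanes {4,3,2,1} (lane 0, the least-significant, holds 1); paired
+// with Lt, this emulates FirstN on targets without mask registers.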
+template +HWY_API Vec128 Iota(const Simd d, const T2 first) { + HWY_ALIGN T lanes[16 / sizeof(T)]; + for (size_t i = 0; i < 16 / sizeof(T); ++i) { + lanes[i] = static_cast(first + static_cast(i)); + } + return Load(d, lanes); +} + +// ------------------------------ MaskedLoad + +#if HWY_TARGET <= HWY_AVX3 + +template +HWY_API Vec128 MaskedLoad(Mask128 m, Simd /* tag */, + const T* HWY_RESTRICT p) { + return Vec128{_mm_maskz_loadu_epi8(m.raw, p)}; +} + +template +HWY_API Vec128 MaskedLoad(Mask128 m, Simd /* tag */, + const T* HWY_RESTRICT p) { + return Vec128{_mm_maskz_loadu_epi16(m.raw, p)}; +} + +template +HWY_API Vec128 MaskedLoad(Mask128 m, Simd /* tag */, + const T* HWY_RESTRICT p) { + return Vec128{_mm_maskz_loadu_epi32(m.raw, p)}; +} + +template +HWY_API Vec128 MaskedLoad(Mask128 m, Simd /* tag */, + const T* HWY_RESTRICT p) { + return Vec128{_mm_maskz_loadu_epi64(m.raw, p)}; +} + +template +HWY_API Vec128 MaskedLoad(Mask128 m, + Simd /* tag */, + const float* HWY_RESTRICT p) { + return Vec128{_mm_maskz_loadu_ps(m.raw, p)}; +} + +template +HWY_API Vec128 MaskedLoad(Mask128 m, + Simd /* tag */, + const double* HWY_RESTRICT p) { + return Vec128{_mm_maskz_loadu_pd(m.raw, p)}; +} + +#elif HWY_TARGET == HWY_AVX2 + +template +HWY_API Vec128 MaskedLoad(Mask128 m, Simd /* tag */, + const T* HWY_RESTRICT p) { + auto p_p = reinterpret_cast(p); // NOLINT + return Vec128{_mm_maskload_epi32(p_p, m.raw)}; +} + +template +HWY_API Vec128 MaskedLoad(Mask128 m, Simd /* tag */, + const T* HWY_RESTRICT p) { + auto p_p = reinterpret_cast(p); // NOLINT + return Vec128{_mm_maskload_epi64(p_p, m.raw)}; +} + +template +HWY_API Vec128 MaskedLoad(Mask128 m, Simd d, + const float* HWY_RESTRICT p) { + const Vec128 mi = + BitCast(RebindToSigned(), VecFromMask(d, m)); + return Vec128{_mm_maskload_ps(p, mi.raw)}; +} + +template +HWY_API Vec128 MaskedLoad(Mask128 m, Simd d, + const double* HWY_RESTRICT p) { + const Vec128 mi = + BitCast(RebindToSigned(), VecFromMask(d, m)); + return Vec128{_mm_maskload_pd(p, mi.raw)}; +} + +// There is no maskload_epi8/16, so blend instead. +template * = nullptr> +HWY_API Vec128 MaskedLoad(Mask128 m, Simd d, + const T* HWY_RESTRICT p) { + return IfThenElseZero(m, Load(d, p)); +} + +#else // <= SSE4 + +// Avoid maskmov* - its nontemporal 'hint' causes it to bypass caches (slow). 
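+// Editor's note (illustrative): the fallback below loads all in-bounds lanes
+// and zeroes the false ones, e.g. MaskedLoad(FirstN(d, 2), d, p) on four
+// lanes yields {0, 0, p[1], p[0]}, assuming p itself is fully readable.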
+template +HWY_API Vec128 MaskedLoad(Mask128 m, Simd d, + const T* HWY_RESTRICT p) { + return IfThenElseZero(m, Load(d, p)); +} + +#endif + +// ------------------------------ Store + +template +HWY_API void Store(Vec128 v, Full128 /* tag */, T* HWY_RESTRICT aligned) { + _mm_store_si128(reinterpret_cast<__m128i*>(aligned), v.raw); +} +HWY_API void Store(const Vec128 v, Full128 /* tag */, + float* HWY_RESTRICT aligned) { + _mm_store_ps(aligned, v.raw); +} +HWY_API void Store(const Vec128 v, Full128 /* tag */, + double* HWY_RESTRICT aligned) { + _mm_store_pd(aligned, v.raw); +} + +template +HWY_API void StoreU(Vec128 v, Full128 /* tag */, T* HWY_RESTRICT p) { + _mm_storeu_si128(reinterpret_cast<__m128i*>(p), v.raw); +} +HWY_API void StoreU(const Vec128 v, Full128 /* tag */, + float* HWY_RESTRICT p) { + _mm_storeu_ps(p, v.raw); +} +HWY_API void StoreU(const Vec128 v, Full128 /* tag */, + double* HWY_RESTRICT p) { + _mm_storeu_pd(p, v.raw); +} + +template +HWY_API void Store(Vec64 v, Full64 /* tag */, T* HWY_RESTRICT p) { +#if HWY_SAFE_PARTIAL_LOAD_STORE + CopyBytes<8>(&v, p); // not same size +#else + _mm_storel_epi64(reinterpret_cast<__m128i*>(p), v.raw); +#endif +} +HWY_API void Store(const Vec128 v, Full64 /* tag */, + float* HWY_RESTRICT p) { +#if HWY_SAFE_PARTIAL_LOAD_STORE + CopyBytes<8>(&v, p); // not same size +#else + _mm_storel_pi(reinterpret_cast<__m64*>(p), v.raw); +#endif +} +HWY_API void Store(const Vec64 v, Full64 /* tag */, + double* HWY_RESTRICT p) { +#if HWY_SAFE_PARTIAL_LOAD_STORE + CopyBytes<8>(&v, p); // not same size +#else + _mm_storel_pd(p, v.raw); +#endif +} + +// Any <= 32 bit except +template +HWY_API void Store(Vec128 v, Simd /* tag */, T* HWY_RESTRICT p) { + CopyBytes(&v, p); // not same size +} +HWY_API void Store(const Vec128 v, Full32 /* tag */, + float* HWY_RESTRICT p) { +#if HWY_SAFE_PARTIAL_LOAD_STORE + CopyBytes<4>(&v, p); // not same size +#else + _mm_store_ss(p, v.raw); +#endif +} + +// For < 128 bit, StoreU == Store. +template +HWY_API void StoreU(const Vec128 v, Simd d, T* HWY_RESTRICT p) { + Store(v, d, p); +} + +// ------------------------------ BlendedStore + +namespace detail { + +// There is no maskload_epi8/16 with which we could safely implement +// BlendedStore. Manual blending is also unsafe because loading a full vector +// that crosses the array end causes asan faults. Resort to scalar code; the +// caller should instead use memcpy, assuming m is FirstN(d, n). +template +HWY_API void ScalarMaskedStore(Vec128 v, Mask128 m, Simd d, + T* HWY_RESTRICT p) { + const RebindToSigned di; // for testing mask if T=bfloat16_t. 
+ using TI = TFromD; + alignas(16) TI buf[N]; + alignas(16) TI mask[N]; + Store(BitCast(di, v), di, buf); + Store(BitCast(di, VecFromMask(d, m)), di, mask); + for (size_t i = 0; i < N; ++i) { + if (mask[i]) { + CopySameSize(buf + i, p + i); + } + } +} +} // namespace detail + +#if HWY_TARGET <= HWY_AVX3 + +template +HWY_API void BlendedStore(Vec128 v, Mask128 m, + Simd /* tag */, T* HWY_RESTRICT p) { + _mm_mask_storeu_epi8(p, m.raw, v.raw); +} +template +HWY_API void BlendedStore(Vec128 v, Mask128 m, + Simd /* tag */, T* HWY_RESTRICT p) { + _mm_mask_storeu_epi16(p, m.raw, v.raw); +} + +template +HWY_API void BlendedStore(Vec128 v, Mask128 m, + Simd /* tag */, T* HWY_RESTRICT p) { + auto pi = reinterpret_cast(p); // NOLINT + _mm_mask_storeu_epi32(pi, m.raw, v.raw); +} + +template +HWY_API void BlendedStore(Vec128 v, Mask128 m, + Simd /* tag */, T* HWY_RESTRICT p) { + auto pi = reinterpret_cast(p); // NOLINT + _mm_mask_storeu_epi64(pi, m.raw, v.raw); +} + +template +HWY_API void BlendedStore(Vec128 v, Mask128 m, + Simd, float* HWY_RESTRICT p) { + _mm_mask_storeu_ps(p, m.raw, v.raw); +} + +template +HWY_API void BlendedStore(Vec128 v, Mask128 m, + Simd, double* HWY_RESTRICT p) { + _mm_mask_storeu_pd(p, m.raw, v.raw); +} + +#elif HWY_TARGET == HWY_AVX2 + +template * = nullptr> +HWY_API void BlendedStore(Vec128 v, Mask128 m, Simd d, + T* HWY_RESTRICT p) { + detail::ScalarMaskedStore(v, m, d, p); +} + +template +HWY_API void BlendedStore(Vec128 v, Mask128 m, + Simd /* tag */, T* HWY_RESTRICT p) { + // For partial vectors, avoid writing other lanes by zeroing their mask. + if (N < 4) { + const Full128 df; + const Mask128 mf{m.raw}; + m = Mask128{And(mf, FirstN(df, N)).raw}; + } + + auto pi = reinterpret_cast(p); // NOLINT + _mm_maskstore_epi32(pi, m.raw, v.raw); +} + +template +HWY_API void BlendedStore(Vec128 v, Mask128 m, + Simd /* tag */, T* HWY_RESTRICT p) { + // For partial vectors, avoid writing other lanes by zeroing their mask. + if (N < 2) { + const Full128 df; + const Mask128 mf{m.raw}; + m = Mask128{And(mf, FirstN(df, N)).raw}; + } + + auto pi = reinterpret_cast(p); // NOLINT + _mm_maskstore_epi64(pi, m.raw, v.raw); +} + +template +HWY_API void BlendedStore(Vec128 v, Mask128 m, + Simd d, float* HWY_RESTRICT p) { + using T = float; + // For partial vectors, avoid writing other lanes by zeroing their mask. + if (N < 4) { + const Full128 df; + const Mask128 mf{m.raw}; + m = Mask128{And(mf, FirstN(df, N)).raw}; + } + + const Vec128, N> mi = + BitCast(RebindToSigned(), VecFromMask(d, m)); + _mm_maskstore_ps(p, mi.raw, v.raw); +} + +template +HWY_API void BlendedStore(Vec128 v, Mask128 m, + Simd d, double* HWY_RESTRICT p) { + using T = double; + // For partial vectors, avoid writing other lanes by zeroing their mask. + if (N < 2) { + const Full128 df; + const Mask128 mf{m.raw}; + m = Mask128{And(mf, FirstN(df, N)).raw}; + } + + const Vec128, N> mi = + BitCast(RebindToSigned(), VecFromMask(d, m)); + _mm_maskstore_pd(p, mi.raw, v.raw); +} + +#else // <= SSE4 + +template +HWY_API void BlendedStore(Vec128 v, Mask128 m, Simd d, + T* HWY_RESTRICT p) { + // Avoid maskmov* - its nontemporal 'hint' causes it to bypass caches (slow). 
+  detail::ScalarMaskedStore(v, m, d, p);
+}
+
+#endif  // SSE4
+
+// ================================================== ARITHMETIC
+
+// ------------------------------ Addition
+
+// Unsigned
+template <size_t N>
+HWY_API Vec128<uint8_t, N> operator+(const Vec128<uint8_t, N> a,
+                                     const Vec128<uint8_t, N> b) {
+  return Vec128<uint8_t, N>{_mm_add_epi8(a.raw, b.raw)};
+}
+template <size_t N>
+HWY_API Vec128<uint16_t, N> operator+(const Vec128<uint16_t, N> a,
+                                      const Vec128<uint16_t, N> b) {
+  return Vec128<uint16_t, N>{_mm_add_epi16(a.raw, b.raw)};
+}
+template <size_t N>
+HWY_API Vec128<uint32_t, N> operator+(const Vec128<uint32_t, N> a,
+                                      const Vec128<uint32_t, N> b) {
+  return Vec128<uint32_t, N>{_mm_add_epi32(a.raw, b.raw)};
+}
+template <size_t N>
+HWY_API Vec128<uint64_t, N> operator+(const Vec128<uint64_t, N> a,
+                                      const Vec128<uint64_t, N> b) {
+  return Vec128<uint64_t, N>{_mm_add_epi64(a.raw, b.raw)};
+}
+
+// Signed
+template <size_t N>
+HWY_API Vec128<int8_t, N> operator+(const Vec128<int8_t, N> a,
+                                    const Vec128<int8_t, N> b) {
+  return Vec128<int8_t, N>{_mm_add_epi8(a.raw, b.raw)};
+}
+template <size_t N>
+HWY_API Vec128<int16_t, N> operator+(const Vec128<int16_t, N> a,
+                                     const Vec128<int16_t, N> b) {
+  return Vec128<int16_t, N>{_mm_add_epi16(a.raw, b.raw)};
+}
+template <size_t N>
+HWY_API Vec128<int32_t, N> operator+(const Vec128<int32_t, N> a,
+                                     const Vec128<int32_t, N> b) {
+  return Vec128<int32_t, N>{_mm_add_epi32(a.raw, b.raw)};
+}
+template <size_t N>
+HWY_API Vec128<int64_t, N> operator+(const Vec128<int64_t, N> a,
+                                     const Vec128<int64_t, N> b) {
+  return Vec128<int64_t, N>{_mm_add_epi64(a.raw, b.raw)};
+}
+
+// Float
+template <size_t N>
+HWY_API Vec128<float, N> operator+(const Vec128<float, N> a,
+                                   const Vec128<float, N> b) {
+  return Vec128<float, N>{_mm_add_ps(a.raw, b.raw)};
+}
+template <size_t N>
+HWY_API Vec128<double, N> operator+(const Vec128<double, N> a,
+                                    const Vec128<double, N> b) {
+  return Vec128<double, N>{_mm_add_pd(a.raw, b.raw)};
+}
+
+// ------------------------------ Subtraction
+
+// Unsigned
+template <size_t N>
+HWY_API Vec128<uint8_t, N> operator-(const Vec128<uint8_t, N> a,
+                                     const Vec128<uint8_t, N> b) {
+  return Vec128<uint8_t, N>{_mm_sub_epi8(a.raw, b.raw)};
+}
+template <size_t N>
+HWY_API Vec128<uint16_t, N> operator-(Vec128<uint16_t, N> a,
+                                      Vec128<uint16_t, N> b) {
+  return Vec128<uint16_t, N>{_mm_sub_epi16(a.raw, b.raw)};
+}
+template <size_t N>
+HWY_API Vec128<uint32_t, N> operator-(const Vec128<uint32_t, N> a,
+                                      const Vec128<uint32_t, N> b) {
+  return Vec128<uint32_t, N>{_mm_sub_epi32(a.raw, b.raw)};
+}
+template <size_t N>
+HWY_API Vec128<uint64_t, N> operator-(const Vec128<uint64_t, N> a,
+                                      const Vec128<uint64_t, N> b) {
+  return Vec128<uint64_t, N>{_mm_sub_epi64(a.raw, b.raw)};
+}
+
+// Signed
+template <size_t N>
+HWY_API Vec128<int8_t, N> operator-(const Vec128<int8_t, N> a,
+                                    const Vec128<int8_t, N> b) {
+  return Vec128<int8_t, N>{_mm_sub_epi8(a.raw, b.raw)};
+}
+template <size_t N>
+HWY_API Vec128<int16_t, N> operator-(const Vec128<int16_t, N> a,
+                                     const Vec128<int16_t, N> b) {
+  return Vec128<int16_t, N>{_mm_sub_epi16(a.raw, b.raw)};
+}
+template <size_t N>
+HWY_API Vec128<int32_t, N> operator-(const Vec128<int32_t, N> a,
+                                     const Vec128<int32_t, N> b) {
+  return Vec128<int32_t, N>{_mm_sub_epi32(a.raw, b.raw)};
+}
+template <size_t N>
+HWY_API Vec128<int64_t, N> operator-(const Vec128<int64_t, N> a,
+                                     const Vec128<int64_t, N> b) {
+  return Vec128<int64_t, N>{_mm_sub_epi64(a.raw, b.raw)};
+}
+
+// Float
+template <size_t N>
+HWY_API Vec128<float, N> operator-(const Vec128<float, N> a,
+                                   const Vec128<float, N> b) {
+  return Vec128<float, N>{_mm_sub_ps(a.raw, b.raw)};
+}
+template <size_t N>
+HWY_API Vec128<double, N> operator-(const Vec128<double, N> a,
+                                    const Vec128<double, N> b) {
+  return Vec128<double, N>{_mm_sub_pd(a.raw, b.raw)};
+}
+
+// ------------------------------ SumsOf8
+template <size_t N>
+HWY_API Vec128<uint64_t, N / 8> SumsOf8(const Vec128<uint8_t, N> v) {
+  return Vec128<uint64_t, N / 8>{_mm_sad_epu8(v.raw, _mm_setzero_si128())};
+}
+
+// ------------------------------ SaturatedAdd
+
+// Returns a + b clamped to the destination range.
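+// Editor's note (illustrative): e.g. for uint8_t lanes, 250 + 10 saturates
+// to 255 instead of wrapping to 4; for int8_t, 120 + 10 saturates to 127.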
+
+// Unsigned
+template <size_t N>
+HWY_API Vec128<uint8_t, N> SaturatedAdd(const Vec128<uint8_t, N> a,
+                                        const Vec128<uint8_t, N> b) {
+  return Vec128<uint8_t, N>{_mm_adds_epu8(a.raw, b.raw)};
+}
+template <size_t N>
+HWY_API Vec128<uint16_t, N> SaturatedAdd(const Vec128<uint16_t, N> a,
+                                         const Vec128<uint16_t, N> b) {
+  return Vec128<uint16_t, N>{_mm_adds_epu16(a.raw, b.raw)};
+}
+
+// Signed
+template <size_t N>
+HWY_API Vec128<int8_t, N> SaturatedAdd(const Vec128<int8_t, N> a,
+                                       const Vec128<int8_t, N> b) {
+  return Vec128<int8_t, N>{_mm_adds_epi8(a.raw, b.raw)};
+}
+template <size_t N>
+HWY_API Vec128<int16_t, N> SaturatedAdd(const Vec128<int16_t, N> a,
+                                        const Vec128<int16_t, N> b) {
+  return Vec128<int16_t, N>{_mm_adds_epi16(a.raw, b.raw)};
+}
+
+// ------------------------------ SaturatedSub
+
+// Returns a - b clamped to the destination range.
+
+// Unsigned
+template <size_t N>
+HWY_API Vec128<uint8_t, N> SaturatedSub(const Vec128<uint8_t, N> a,
+                                        const Vec128<uint8_t, N> b) {
+  return Vec128<uint8_t, N>{_mm_subs_epu8(a.raw, b.raw)};
+}
+template <size_t N>
+HWY_API Vec128<uint16_t, N> SaturatedSub(const Vec128<uint16_t, N> a,
+                                         const Vec128<uint16_t, N> b) {
+  return Vec128<uint16_t, N>{_mm_subs_epu16(a.raw, b.raw)};
+}
+
+// Signed
+template <size_t N>
+HWY_API Vec128<int8_t, N> SaturatedSub(const Vec128<int8_t, N> a,
+                                       const Vec128<int8_t, N> b) {
+  return Vec128<int8_t, N>{_mm_subs_epi8(a.raw, b.raw)};
+}
+template <size_t N>
+HWY_API Vec128<int16_t, N> SaturatedSub(const Vec128<int16_t, N> a,
+                                        const Vec128<int16_t, N> b) {
+  return Vec128<int16_t, N>{_mm_subs_epi16(a.raw, b.raw)};
+}
+
+// ------------------------------ AverageRound
+
+// Returns (a + b + 1) / 2
+
+// Unsigned
+template <size_t N>
+HWY_API Vec128<uint8_t, N> AverageRound(const Vec128<uint8_t, N> a,
+                                        const Vec128<uint8_t, N> b) {
+  return Vec128<uint8_t, N>{_mm_avg_epu8(a.raw, b.raw)};
+}
+template <size_t N>
+HWY_API Vec128<uint16_t, N> AverageRound(const Vec128<uint16_t, N> a,
+                                         const Vec128<uint16_t, N> b) {
+  return Vec128<uint16_t, N>{_mm_avg_epu16(a.raw, b.raw)};
+}
+
+// ------------------------------ Integer multiplication
+
+template <size_t N>
+HWY_API Vec128<uint16_t, N> operator*(const Vec128<uint16_t, N> a,
+                                      const Vec128<uint16_t, N> b) {
+  return Vec128<uint16_t, N>{_mm_mullo_epi16(a.raw, b.raw)};
+}
+template <size_t N>
+HWY_API Vec128<int16_t, N> operator*(const Vec128<int16_t, N> a,
+                                     const Vec128<int16_t, N> b) {
+  return Vec128<int16_t, N>{_mm_mullo_epi16(a.raw, b.raw)};
+}
+
+// Returns the upper 16 bits of a * b in each lane.
+template <size_t N>
+HWY_API Vec128<uint16_t, N> MulHigh(const Vec128<uint16_t, N> a,
+                                    const Vec128<uint16_t, N> b) {
+  return Vec128<uint16_t, N>{_mm_mulhi_epu16(a.raw, b.raw)};
+}
+template <size_t N>
+HWY_API Vec128<int16_t, N> MulHigh(const Vec128<int16_t, N> a,
+                                   const Vec128<int16_t, N> b) {
+  return Vec128<int16_t, N>{_mm_mulhi_epi16(a.raw, b.raw)};
+}
+
+template <size_t N>
+HWY_API Vec128<int16_t, N> MulFixedPoint15(const Vec128<int16_t, N> a,
+                                           const Vec128<int16_t, N> b) {
+  return Vec128<int16_t, N>{_mm_mulhrs_epi16(a.raw, b.raw)};
+}
+
+// Multiplies even lanes (0, 2 ..) and places the double-wide result into
+// even and the upper half into its odd neighbor lane.
+template <size_t N>
+HWY_API Vec128<uint64_t, (N + 1) / 2> MulEven(const Vec128<uint32_t, N> a,
+                                              const Vec128<uint32_t, N> b) {
+  return Vec128<uint64_t, (N + 1) / 2>{_mm_mul_epu32(a.raw, b.raw)};
+}
+
+#if HWY_TARGET == HWY_SSSE3
+
+template <size_t N, HWY_IF_LE64(int32_t, N)>  // N=1 or 2
+HWY_API Vec128<int64_t, (N + 1) / 2> MulEven(const Vec128<int32_t, N> a,
+                                             const Vec128<int32_t, N> b) {
+  return Set(Simd<int64_t, (N + 1) / 2, 0>(),
+             static_cast<int64_t>(GetLane(a)) * GetLane(b));
+}
+HWY_API Vec128<int64_t> MulEven(const Vec128<int32_t> a,
+                                const Vec128<int32_t> b) {
+  alignas(16) int32_t a_lanes[4];
+  alignas(16) int32_t b_lanes[4];
+  const Full128<int32_t> di32;
+  Store(a, di32, a_lanes);
+  Store(b, di32, b_lanes);
+  alignas(16) int64_t mul[2];
+  mul[0] = static_cast<int64_t>(a_lanes[0]) * b_lanes[0];
+  mul[1] = static_cast<int64_t>(a_lanes[2]) * b_lanes[2];
+  return Load(Full128<int64_t>(), mul);
+}
+
+#else  // HWY_TARGET == HWY_SSSE3
+
+template <size_t N>
+HWY_API Vec128<int64_t, (N + 1) / 2> MulEven(const Vec128<int32_t, N> a,
+                                             const Vec128<int32_t, N> b) {
+  return Vec128<int64_t, (N + 1) / 2>{_mm_mul_epi32(a.raw, b.raw)};
+}
+
+#endif  // HWY_TARGET == HWY_SSSE3
+
+template <size_t N>
+HWY_API Vec128<uint32_t, N> operator*(const Vec128<uint32_t, N> a,
+                                      const Vec128<uint32_t, N> b) {
+#if HWY_TARGET == HWY_SSSE3
+  // Not as inefficient as it looks: _mm_mullo_epi32 has 10 cycle latency.
+  // 64-bit right shift would also work but also needs port 5, so no benefit.
+  // Notation: x=don't care, z=0.
+ const __m128i a_x3x1 = _mm_shuffle_epi32(a.raw, _MM_SHUFFLE(3, 3, 1, 1)); + const auto mullo_x2x0 = MulEven(a, b); + const __m128i b_x3x1 = _mm_shuffle_epi32(b.raw, _MM_SHUFFLE(3, 3, 1, 1)); + const auto mullo_x3x1 = + MulEven(Vec128{a_x3x1}, Vec128{b_x3x1}); + // We could _mm_slli_epi64 by 32 to get 3z1z and OR with z2z0, but generating + // the latter requires one more instruction or a constant. + const __m128i mul_20 = + _mm_shuffle_epi32(mullo_x2x0.raw, _MM_SHUFFLE(2, 0, 2, 0)); + const __m128i mul_31 = + _mm_shuffle_epi32(mullo_x3x1.raw, _MM_SHUFFLE(2, 0, 2, 0)); + return Vec128{_mm_unpacklo_epi32(mul_20, mul_31)}; +#else + return Vec128{_mm_mullo_epi32(a.raw, b.raw)}; +#endif +} + +template +HWY_API Vec128 operator*(const Vec128 a, + const Vec128 b) { + // Same as unsigned; avoid duplicating the SSSE3 code. + const DFromV d; + const RebindToUnsigned du; + return BitCast(d, BitCast(du, a) * BitCast(du, b)); +} + +// ------------------------------ RotateRight (ShiftRight, Or) + +template +HWY_API Vec128 RotateRight(const Vec128 v) { + static_assert(0 <= kBits && kBits < 32, "Invalid shift count"); +#if HWY_TARGET <= HWY_AVX3 + return Vec128{_mm_ror_epi32(v.raw, kBits)}; +#else + if (kBits == 0) return v; + return Or(ShiftRight(v), ShiftLeft(v)); +#endif +} + +template +HWY_API Vec128 RotateRight(const Vec128 v) { + static_assert(0 <= kBits && kBits < 64, "Invalid shift count"); +#if HWY_TARGET <= HWY_AVX3 + return Vec128{_mm_ror_epi64(v.raw, kBits)}; +#else + if (kBits == 0) return v; + return Or(ShiftRight(v), ShiftLeft(v)); +#endif +} + +// ------------------------------ BroadcastSignBit (ShiftRight, compare, mask) + +template +HWY_API Vec128 BroadcastSignBit(const Vec128 v) { + const DFromV d; + return VecFromMask(v < Zero(d)); +} + +template +HWY_API Vec128 BroadcastSignBit(const Vec128 v) { + return ShiftRight<15>(v); +} + +template +HWY_API Vec128 BroadcastSignBit(const Vec128 v) { + return ShiftRight<31>(v); +} + +template +HWY_API Vec128 BroadcastSignBit(const Vec128 v) { + const DFromV d; +#if HWY_TARGET <= HWY_AVX3 + (void)d; + return Vec128{_mm_srai_epi64(v.raw, 63)}; +#elif HWY_TARGET == HWY_AVX2 || HWY_TARGET == HWY_SSE4 + return VecFromMask(v < Zero(d)); +#else + // Efficient Lt() requires SSE4.2 and BLENDVPD requires SSE4.1. 32-bit shift + // avoids generating a zero. 
+ const RepartitionToNarrow d32; + const auto sign = ShiftRight<31>(BitCast(d32, v)); + return Vec128{ + _mm_shuffle_epi32(sign.raw, _MM_SHUFFLE(3, 3, 1, 1))}; +#endif +} + +template +HWY_API Vec128 Abs(const Vec128 v) { +#if HWY_TARGET <= HWY_AVX3 + return Vec128{_mm_abs_epi64(v.raw)}; +#else + const auto zero = Zero(DFromV()); + return IfThenElse(MaskFromVec(BroadcastSignBit(v)), zero - v, v); +#endif +} + +template +HWY_API Vec128 ShiftRight(const Vec128 v) { +#if HWY_TARGET <= HWY_AVX3 + return Vec128{_mm_srai_epi64(v.raw, kBits)}; +#else + const DFromV di; + const RebindToUnsigned du; + const auto right = BitCast(di, ShiftRight(BitCast(du, v))); + const auto sign = ShiftLeft<64 - kBits>(BroadcastSignBit(v)); + return right | sign; +#endif +} + +// ------------------------------ ZeroIfNegative (BroadcastSignBit) +template +HWY_API Vec128 ZeroIfNegative(Vec128 v) { + static_assert(IsFloat(), "Only works for float"); + const DFromV d; +#if HWY_TARGET == HWY_SSSE3 + const RebindToSigned di; + const auto mask = MaskFromVec(BitCast(d, BroadcastSignBit(BitCast(di, v)))); +#else + const auto mask = MaskFromVec(v); // MSB is sufficient for BLENDVPS +#endif + return IfThenElse(mask, Zero(d), v); +} + +// ------------------------------ IfNegativeThenElse +template +HWY_API Vec128 IfNegativeThenElse(const Vec128 v, + const Vec128 yes, + const Vec128 no) { + // int8: IfThenElse only looks at the MSB. + return IfThenElse(MaskFromVec(v), yes, no); +} + +template +HWY_API Vec128 IfNegativeThenElse(Vec128 v, Vec128 yes, + Vec128 no) { + static_assert(IsSigned(), "Only works for signed/float"); + const DFromV d; + const RebindToSigned di; + + // 16-bit: no native blendv, so copy sign to lower byte's MSB. + v = BitCast(d, BroadcastSignBit(BitCast(di, v))); + return IfThenElse(MaskFromVec(v), yes, no); +} + +template +HWY_API Vec128 IfNegativeThenElse(Vec128 v, Vec128 yes, + Vec128 no) { + static_assert(IsSigned(), "Only works for signed/float"); + const DFromV d; + const RebindToFloat df; + + // 32/64-bit: use float IfThenElse, which only looks at the MSB. + return BitCast(d, IfThenElse(MaskFromVec(BitCast(df, v)), BitCast(df, yes), + BitCast(df, no))); +} + +// ------------------------------ ShiftLeftSame + +template +HWY_API Vec128 ShiftLeftSame(const Vec128 v, + const int bits) { + return Vec128{_mm_sll_epi16(v.raw, _mm_cvtsi32_si128(bits))}; +} +template +HWY_API Vec128 ShiftLeftSame(const Vec128 v, + const int bits) { + return Vec128{_mm_sll_epi32(v.raw, _mm_cvtsi32_si128(bits))}; +} +template +HWY_API Vec128 ShiftLeftSame(const Vec128 v, + const int bits) { + return Vec128{_mm_sll_epi64(v.raw, _mm_cvtsi32_si128(bits))}; +} + +template +HWY_API Vec128 ShiftLeftSame(const Vec128 v, + const int bits) { + return Vec128{_mm_sll_epi16(v.raw, _mm_cvtsi32_si128(bits))}; +} + +template +HWY_API Vec128 ShiftLeftSame(const Vec128 v, + const int bits) { + return Vec128{_mm_sll_epi32(v.raw, _mm_cvtsi32_si128(bits))}; +} + +template +HWY_API Vec128 ShiftLeftSame(const Vec128 v, + const int bits) { + return Vec128{_mm_sll_epi64(v.raw, _mm_cvtsi32_si128(bits))}; +} + +template +HWY_API Vec128 ShiftLeftSame(const Vec128 v, const int bits) { + const DFromV d8; + // Use raw instead of BitCast to support N=1. 
+ const Vec128 shifted{ + ShiftLeftSame(Vec128>{v.raw}, bits).raw}; + return shifted & Set(d8, static_cast((0xFF << bits) & 0xFF)); +} + +// ------------------------------ ShiftRightSame (BroadcastSignBit) + +template +HWY_API Vec128 ShiftRightSame(const Vec128 v, + const int bits) { + return Vec128{_mm_srl_epi16(v.raw, _mm_cvtsi32_si128(bits))}; +} +template +HWY_API Vec128 ShiftRightSame(const Vec128 v, + const int bits) { + return Vec128{_mm_srl_epi32(v.raw, _mm_cvtsi32_si128(bits))}; +} +template +HWY_API Vec128 ShiftRightSame(const Vec128 v, + const int bits) { + return Vec128{_mm_srl_epi64(v.raw, _mm_cvtsi32_si128(bits))}; +} + +template +HWY_API Vec128 ShiftRightSame(Vec128 v, + const int bits) { + const DFromV d8; + // Use raw instead of BitCast to support N=1. + const Vec128 shifted{ + ShiftRightSame(Vec128{v.raw}, bits).raw}; + return shifted & Set(d8, static_cast(0xFF >> bits)); +} + +template +HWY_API Vec128 ShiftRightSame(const Vec128 v, + const int bits) { + return Vec128{_mm_sra_epi16(v.raw, _mm_cvtsi32_si128(bits))}; +} + +template +HWY_API Vec128 ShiftRightSame(const Vec128 v, + const int bits) { + return Vec128{_mm_sra_epi32(v.raw, _mm_cvtsi32_si128(bits))}; +} +template +HWY_API Vec128 ShiftRightSame(const Vec128 v, + const int bits) { +#if HWY_TARGET <= HWY_AVX3 + return Vec128{_mm_sra_epi64(v.raw, _mm_cvtsi32_si128(bits))}; +#else + const DFromV di; + const RebindToUnsigned du; + const auto right = BitCast(di, ShiftRightSame(BitCast(du, v), bits)); + const auto sign = ShiftLeftSame(BroadcastSignBit(v), 64 - bits); + return right | sign; +#endif +} + +template +HWY_API Vec128 ShiftRightSame(Vec128 v, const int bits) { + const DFromV di; + const RebindToUnsigned du; + const auto shifted = BitCast(di, ShiftRightSame(BitCast(du, v), bits)); + const auto shifted_sign = + BitCast(di, Set(du, static_cast(0x80 >> bits))); + return (shifted ^ shifted_sign) - shifted_sign; +} + +// ------------------------------ Floating-point mul / div + +template +HWY_API Vec128 operator*(Vec128 a, Vec128 b) { + return Vec128{_mm_mul_ps(a.raw, b.raw)}; +} +HWY_API Vec128 operator*(const Vec128 a, + const Vec128 b) { + return Vec128{_mm_mul_ss(a.raw, b.raw)}; +} +template +HWY_API Vec128 operator*(const Vec128 a, + const Vec128 b) { + return Vec128{_mm_mul_pd(a.raw, b.raw)}; +} +HWY_API Vec64 operator*(const Vec64 a, const Vec64 b) { + return Vec64{_mm_mul_sd(a.raw, b.raw)}; +} + +template +HWY_API Vec128 operator/(const Vec128 a, + const Vec128 b) { + return Vec128{_mm_div_ps(a.raw, b.raw)}; +} +HWY_API Vec128 operator/(const Vec128 a, + const Vec128 b) { + return Vec128{_mm_div_ss(a.raw, b.raw)}; +} +template +HWY_API Vec128 operator/(const Vec128 a, + const Vec128 b) { + return Vec128{_mm_div_pd(a.raw, b.raw)}; +} +HWY_API Vec64 operator/(const Vec64 a, const Vec64 b) { + return Vec64{_mm_div_sd(a.raw, b.raw)}; +} + +// Approximate reciprocal +template +HWY_API Vec128 ApproximateReciprocal(const Vec128 v) { + return Vec128{_mm_rcp_ps(v.raw)}; +} +HWY_API Vec128 ApproximateReciprocal(const Vec128 v) { + return Vec128{_mm_rcp_ss(v.raw)}; +} + +// Absolute value of difference. 
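+// Editor's note (illustrative): e.g. AbsDiff(Set(d, 3.0f), Set(d, 5.0f))
+// yields 2.0f in every lane; it is simply Abs(a - b), as defined below.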
+template +HWY_API Vec128 AbsDiff(const Vec128 a, + const Vec128 b) { + return Abs(a - b); +} + +// ------------------------------ Floating-point multiply-add variants + +// Returns mul * x + add +template +HWY_API Vec128 MulAdd(const Vec128 mul, + const Vec128 x, + const Vec128 add) { +#if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4 + return mul * x + add; +#else + return Vec128{_mm_fmadd_ps(mul.raw, x.raw, add.raw)}; +#endif +} +template +HWY_API Vec128 MulAdd(const Vec128 mul, + const Vec128 x, + const Vec128 add) { +#if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4 + return mul * x + add; +#else + return Vec128{_mm_fmadd_pd(mul.raw, x.raw, add.raw)}; +#endif +} + +// Returns add - mul * x +template +HWY_API Vec128 NegMulAdd(const Vec128 mul, + const Vec128 x, + const Vec128 add) { +#if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4 + return add - mul * x; +#else + return Vec128{_mm_fnmadd_ps(mul.raw, x.raw, add.raw)}; +#endif +} +template +HWY_API Vec128 NegMulAdd(const Vec128 mul, + const Vec128 x, + const Vec128 add) { +#if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4 + return add - mul * x; +#else + return Vec128{_mm_fnmadd_pd(mul.raw, x.raw, add.raw)}; +#endif +} + +// Returns mul * x - sub +template +HWY_API Vec128 MulSub(const Vec128 mul, + const Vec128 x, + const Vec128 sub) { +#if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4 + return mul * x - sub; +#else + return Vec128{_mm_fmsub_ps(mul.raw, x.raw, sub.raw)}; +#endif +} +template +HWY_API Vec128 MulSub(const Vec128 mul, + const Vec128 x, + const Vec128 sub) { +#if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4 + return mul * x - sub; +#else + return Vec128{_mm_fmsub_pd(mul.raw, x.raw, sub.raw)}; +#endif +} + +// Returns -mul * x - sub +template +HWY_API Vec128 NegMulSub(const Vec128 mul, + const Vec128 x, + const Vec128 sub) { +#if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4 + return Neg(mul) * x - sub; +#else + return Vec128{_mm_fnmsub_ps(mul.raw, x.raw, sub.raw)}; +#endif +} +template +HWY_API Vec128 NegMulSub(const Vec128 mul, + const Vec128 x, + const Vec128 sub) { +#if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4 + return Neg(mul) * x - sub; +#else + return Vec128{_mm_fnmsub_pd(mul.raw, x.raw, sub.raw)}; +#endif +} + +// ------------------------------ Floating-point square root + +// Full precision square root +template +HWY_API Vec128 Sqrt(const Vec128 v) { + return Vec128{_mm_sqrt_ps(v.raw)}; +} +HWY_API Vec128 Sqrt(const Vec128 v) { + return Vec128{_mm_sqrt_ss(v.raw)}; +} +template +HWY_API Vec128 Sqrt(const Vec128 v) { + return Vec128{_mm_sqrt_pd(v.raw)}; +} +HWY_API Vec64 Sqrt(const Vec64 v) { + return Vec64{_mm_sqrt_sd(_mm_setzero_pd(), v.raw)}; +} + +// Approximate reciprocal square root +template +HWY_API Vec128 ApproximateReciprocalSqrt(const Vec128 v) { + return Vec128{_mm_rsqrt_ps(v.raw)}; +} +HWY_API Vec128 ApproximateReciprocalSqrt(const Vec128 v) { + return Vec128{_mm_rsqrt_ss(v.raw)}; +} + +// ------------------------------ Min (Gt, IfThenElse) + +namespace detail { + +template +HWY_INLINE HWY_MAYBE_UNUSED Vec128 MinU(const Vec128 a, + const Vec128 b) { + const DFromV d; + const RebindToUnsigned du; + const RebindToSigned di; + const auto msb = Set(du, static_cast(T(1) << (sizeof(T) * 8 - 1))); + const auto gt = RebindMask(du, BitCast(di, a ^ msb) > BitCast(di, b ^ msb)); + return IfThenElse(gt, b, a); +} + +} // namespace detail + +// Unsigned +template +HWY_API Vec128 Min(const Vec128 a, + const Vec128 b) { + return Vec128{_mm_min_epu8(a.raw, b.raw)}; +} +template 
+HWY_API Vec128 Min(const Vec128 a, + const Vec128 b) { +#if HWY_TARGET == HWY_SSSE3 + return detail::MinU(a, b); +#else + return Vec128{_mm_min_epu16(a.raw, b.raw)}; +#endif +} +template +HWY_API Vec128 Min(const Vec128 a, + const Vec128 b) { +#if HWY_TARGET == HWY_SSSE3 + return detail::MinU(a, b); +#else + return Vec128{_mm_min_epu32(a.raw, b.raw)}; +#endif +} +template +HWY_API Vec128 Min(const Vec128 a, + const Vec128 b) { +#if HWY_TARGET <= HWY_AVX3 + return Vec128{_mm_min_epu64(a.raw, b.raw)}; +#else + return detail::MinU(a, b); +#endif +} + +// Signed +template +HWY_API Vec128 Min(const Vec128 a, + const Vec128 b) { +#if HWY_TARGET == HWY_SSSE3 + return IfThenElse(a < b, a, b); +#else + return Vec128{_mm_min_epi8(a.raw, b.raw)}; +#endif +} +template +HWY_API Vec128 Min(const Vec128 a, + const Vec128 b) { + return Vec128{_mm_min_epi16(a.raw, b.raw)}; +} +template +HWY_API Vec128 Min(const Vec128 a, + const Vec128 b) { +#if HWY_TARGET == HWY_SSSE3 + return IfThenElse(a < b, a, b); +#else + return Vec128{_mm_min_epi32(a.raw, b.raw)}; +#endif +} +template +HWY_API Vec128 Min(const Vec128 a, + const Vec128 b) { +#if HWY_TARGET <= HWY_AVX3 + return Vec128{_mm_min_epi64(a.raw, b.raw)}; +#else + return IfThenElse(a < b, a, b); +#endif +} + +// Float +template +HWY_API Vec128 Min(const Vec128 a, + const Vec128 b) { + return Vec128{_mm_min_ps(a.raw, b.raw)}; +} +template +HWY_API Vec128 Min(const Vec128 a, + const Vec128 b) { + return Vec128{_mm_min_pd(a.raw, b.raw)}; +} + +// ------------------------------ Max (Gt, IfThenElse) + +namespace detail { +template +HWY_INLINE HWY_MAYBE_UNUSED Vec128 MaxU(const Vec128 a, + const Vec128 b) { + const DFromV d; + const RebindToUnsigned du; + const RebindToSigned di; + const auto msb = Set(du, static_cast(T(1) << (sizeof(T) * 8 - 1))); + const auto gt = RebindMask(du, BitCast(di, a ^ msb) > BitCast(di, b ^ msb)); + return IfThenElse(gt, a, b); +} + +} // namespace detail + +// Unsigned +template +HWY_API Vec128 Max(const Vec128 a, + const Vec128 b) { + return Vec128{_mm_max_epu8(a.raw, b.raw)}; +} +template +HWY_API Vec128 Max(const Vec128 a, + const Vec128 b) { +#if HWY_TARGET == HWY_SSSE3 + return detail::MaxU(a, b); +#else + return Vec128{_mm_max_epu16(a.raw, b.raw)}; +#endif +} +template +HWY_API Vec128 Max(const Vec128 a, + const Vec128 b) { +#if HWY_TARGET == HWY_SSSE3 + return detail::MaxU(a, b); +#else + return Vec128{_mm_max_epu32(a.raw, b.raw)}; +#endif +} +template +HWY_API Vec128 Max(const Vec128 a, + const Vec128 b) { +#if HWY_TARGET <= HWY_AVX3 + return Vec128{_mm_max_epu64(a.raw, b.raw)}; +#else + return detail::MaxU(a, b); +#endif +} + +// Signed +template +HWY_API Vec128 Max(const Vec128 a, + const Vec128 b) { +#if HWY_TARGET == HWY_SSSE3 + return IfThenElse(a < b, b, a); +#else + return Vec128{_mm_max_epi8(a.raw, b.raw)}; +#endif +} +template +HWY_API Vec128 Max(const Vec128 a, + const Vec128 b) { + return Vec128{_mm_max_epi16(a.raw, b.raw)}; +} +template +HWY_API Vec128 Max(const Vec128 a, + const Vec128 b) { +#if HWY_TARGET == HWY_SSSE3 + return IfThenElse(a < b, b, a); +#else + return Vec128{_mm_max_epi32(a.raw, b.raw)}; +#endif +} +template +HWY_API Vec128 Max(const Vec128 a, + const Vec128 b) { +#if HWY_TARGET <= HWY_AVX3 + return Vec128{_mm_max_epi64(a.raw, b.raw)}; +#else + return IfThenElse(a < b, b, a); +#endif +} + +// Float +template +HWY_API Vec128 Max(const Vec128 a, + const Vec128 b) { + return Vec128{_mm_max_ps(a.raw, b.raw)}; +} +template +HWY_API Vec128 Max(const Vec128 a, + const Vec128 b) { + return 
Vec128{_mm_max_pd(a.raw, b.raw)}; +} + +// ================================================== MEMORY (2) + +// ------------------------------ Non-temporal stores + +// On clang6, we see incorrect code generated for _mm_stream_pi, so +// round even partial vectors up to 16 bytes. +template +HWY_API void Stream(Vec128 v, Simd /* tag */, + T* HWY_RESTRICT aligned) { + _mm_stream_si128(reinterpret_cast<__m128i*>(aligned), v.raw); +} +template +HWY_API void Stream(const Vec128 v, Simd /* tag */, + float* HWY_RESTRICT aligned) { + _mm_stream_ps(aligned, v.raw); +} +template +HWY_API void Stream(const Vec128 v, Simd /* tag */, + double* HWY_RESTRICT aligned) { + _mm_stream_pd(aligned, v.raw); +} + +// ------------------------------ Scatter + +// Work around warnings in the intrinsic definitions (passing -1 as a mask). +HWY_DIAGNOSTICS(push) +HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion") + +// Unfortunately the GCC/Clang intrinsics do not accept int64_t*. +using GatherIndex64 = long long int; // NOLINT(runtime/int) +static_assert(sizeof(GatherIndex64) == 8, "Must be 64-bit type"); + +#if HWY_TARGET <= HWY_AVX3 +namespace detail { + +template +HWY_INLINE void ScatterOffset(hwy::SizeTag<4> /* tag */, Vec128 v, + Simd /* tag */, T* HWY_RESTRICT base, + const Vec128 offset) { + if (N == 4) { + _mm_i32scatter_epi32(base, offset.raw, v.raw, 1); + } else { + const __mmask8 mask = (1u << N) - 1; + _mm_mask_i32scatter_epi32(base, mask, offset.raw, v.raw, 1); + } +} +template +HWY_INLINE void ScatterIndex(hwy::SizeTag<4> /* tag */, Vec128 v, + Simd /* tag */, T* HWY_RESTRICT base, + const Vec128 index) { + if (N == 4) { + _mm_i32scatter_epi32(base, index.raw, v.raw, 4); + } else { + const __mmask8 mask = (1u << N) - 1; + _mm_mask_i32scatter_epi32(base, mask, index.raw, v.raw, 4); + } +} + +template +HWY_INLINE void ScatterOffset(hwy::SizeTag<8> /* tag */, Vec128 v, + Simd /* tag */, T* HWY_RESTRICT base, + const Vec128 offset) { + if (N == 2) { + _mm_i64scatter_epi64(base, offset.raw, v.raw, 1); + } else { + const __mmask8 mask = (1u << N) - 1; + _mm_mask_i64scatter_epi64(base, mask, offset.raw, v.raw, 1); + } +} +template +HWY_INLINE void ScatterIndex(hwy::SizeTag<8> /* tag */, Vec128 v, + Simd /* tag */, T* HWY_RESTRICT base, + const Vec128 index) { + if (N == 2) { + _mm_i64scatter_epi64(base, index.raw, v.raw, 8); + } else { + const __mmask8 mask = (1u << N) - 1; + _mm_mask_i64scatter_epi64(base, mask, index.raw, v.raw, 8); + } +} + +} // namespace detail + +template +HWY_API void ScatterOffset(Vec128 v, Simd d, + T* HWY_RESTRICT base, + const Vec128 offset) { + static_assert(sizeof(T) == sizeof(Offset), "Must match for portability"); + return detail::ScatterOffset(hwy::SizeTag(), v, d, base, offset); +} +template +HWY_API void ScatterIndex(Vec128 v, Simd d, T* HWY_RESTRICT base, + const Vec128 index) { + static_assert(sizeof(T) == sizeof(Index), "Must match for portability"); + return detail::ScatterIndex(hwy::SizeTag(), v, d, base, index); +} + +template +HWY_API void ScatterOffset(Vec128 v, Simd /* tag */, + float* HWY_RESTRICT base, + const Vec128 offset) { + if (N == 4) { + _mm_i32scatter_ps(base, offset.raw, v.raw, 1); + } else { + const __mmask8 mask = (1u << N) - 1; + _mm_mask_i32scatter_ps(base, mask, offset.raw, v.raw, 1); + } +} +template +HWY_API void ScatterIndex(Vec128 v, Simd /* tag */, + float* HWY_RESTRICT base, + const Vec128 index) { + if (N == 4) { + _mm_i32scatter_ps(base, index.raw, v.raw, 4); + } else { + const __mmask8 mask = (1u << N) - 1; + 
_mm_mask_i32scatter_ps(base, mask, index.raw, v.raw, 4); + } +} + +template +HWY_API void ScatterOffset(Vec128 v, Simd /* tag */, + double* HWY_RESTRICT base, + const Vec128 offset) { + if (N == 2) { + _mm_i64scatter_pd(base, offset.raw, v.raw, 1); + } else { + const __mmask8 mask = (1u << N) - 1; + _mm_mask_i64scatter_pd(base, mask, offset.raw, v.raw, 1); + } +} +template +HWY_API void ScatterIndex(Vec128 v, Simd /* tag */, + double* HWY_RESTRICT base, + const Vec128 index) { + if (N == 2) { + _mm_i64scatter_pd(base, index.raw, v.raw, 8); + } else { + const __mmask8 mask = (1u << N) - 1; + _mm_mask_i64scatter_pd(base, mask, index.raw, v.raw, 8); + } +} +#else // HWY_TARGET <= HWY_AVX3 + +template +HWY_API void ScatterOffset(Vec128 v, Simd d, + T* HWY_RESTRICT base, + const Vec128 offset) { + static_assert(sizeof(T) == sizeof(Offset), "Must match for portability"); + + alignas(16) T lanes[N]; + Store(v, d, lanes); + + alignas(16) Offset offset_lanes[N]; + Store(offset, Rebind(), offset_lanes); + + uint8_t* base_bytes = reinterpret_cast(base); + for (size_t i = 0; i < N; ++i) { + CopyBytes(&lanes[i], base_bytes + offset_lanes[i]); + } +} + +template +HWY_API void ScatterIndex(Vec128 v, Simd d, T* HWY_RESTRICT base, + const Vec128 index) { + static_assert(sizeof(T) == sizeof(Index), "Must match for portability"); + + alignas(16) T lanes[N]; + Store(v, d, lanes); + + alignas(16) Index index_lanes[N]; + Store(index, Rebind(), index_lanes); + + for (size_t i = 0; i < N; ++i) { + base[index_lanes[i]] = lanes[i]; + } +} + +#endif + +// ------------------------------ Gather (Load/Store) + +#if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4 + +template +HWY_API Vec128 GatherOffset(const Simd d, + const T* HWY_RESTRICT base, + const Vec128 offset) { + static_assert(sizeof(T) == sizeof(Offset), "Must match for portability"); + + alignas(16) Offset offset_lanes[N]; + Store(offset, Rebind(), offset_lanes); + + alignas(16) T lanes[N]; + const uint8_t* base_bytes = reinterpret_cast(base); + for (size_t i = 0; i < N; ++i) { + CopyBytes(base_bytes + offset_lanes[i], &lanes[i]); + } + return Load(d, lanes); +} + +template +HWY_API Vec128 GatherIndex(const Simd d, + const T* HWY_RESTRICT base, + const Vec128 index) { + static_assert(sizeof(T) == sizeof(Index), "Must match for portability"); + + alignas(16) Index index_lanes[N]; + Store(index, Rebind(), index_lanes); + + alignas(16) T lanes[N]; + for (size_t i = 0; i < N; ++i) { + lanes[i] = base[index_lanes[i]]; + } + return Load(d, lanes); +} + +#else + +namespace detail { + +template +HWY_INLINE Vec128 GatherOffset(hwy::SizeTag<4> /* tag */, + Simd /* d */, + const T* HWY_RESTRICT base, + const Vec128 offset) { + return Vec128{_mm_i32gather_epi32( + reinterpret_cast(base), offset.raw, 1)}; +} +template +HWY_INLINE Vec128 GatherIndex(hwy::SizeTag<4> /* tag */, + Simd /* d */, + const T* HWY_RESTRICT base, + const Vec128 index) { + return Vec128{_mm_i32gather_epi32( + reinterpret_cast(base), index.raw, 4)}; +} + +template +HWY_INLINE Vec128 GatherOffset(hwy::SizeTag<8> /* tag */, + Simd /* d */, + const T* HWY_RESTRICT base, + const Vec128 offset) { + return Vec128{_mm_i64gather_epi64( + reinterpret_cast(base), offset.raw, 1)}; +} +template +HWY_INLINE Vec128 GatherIndex(hwy::SizeTag<8> /* tag */, + Simd /* d */, + const T* HWY_RESTRICT base, + const Vec128 index) { + return Vec128{_mm_i64gather_epi64( + reinterpret_cast(base), index.raw, 8)}; +} + +} // namespace detail + +template +HWY_API Vec128 GatherOffset(Simd d, const T* HWY_RESTRICT base, + const 
Vec128 offset) { + return detail::GatherOffset(hwy::SizeTag(), d, base, offset); +} +template +HWY_API Vec128 GatherIndex(Simd d, const T* HWY_RESTRICT base, + const Vec128 index) { + return detail::GatherIndex(hwy::SizeTag(), d, base, index); +} + +template +HWY_API Vec128 GatherOffset(Simd /* tag */, + const float* HWY_RESTRICT base, + const Vec128 offset) { + return Vec128{_mm_i32gather_ps(base, offset.raw, 1)}; +} +template +HWY_API Vec128 GatherIndex(Simd /* tag */, + const float* HWY_RESTRICT base, + const Vec128 index) { + return Vec128{_mm_i32gather_ps(base, index.raw, 4)}; +} + +template +HWY_API Vec128 GatherOffset(Simd /* tag */, + const double* HWY_RESTRICT base, + const Vec128 offset) { + return Vec128{_mm_i64gather_pd(base, offset.raw, 1)}; +} +template +HWY_API Vec128 GatherIndex(Simd /* tag */, + const double* HWY_RESTRICT base, + const Vec128 index) { + return Vec128{_mm_i64gather_pd(base, index.raw, 8)}; +} + +#endif // HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4 + +HWY_DIAGNOSTICS(pop) + +// ================================================== SWIZZLE (2) + +// ------------------------------ LowerHalf + +// Returns upper/lower half of a vector. +template +HWY_API Vec128 LowerHalf(Simd /* tag */, + Vec128 v) { + return Vec128{v.raw}; +} + +template +HWY_API Vec128 LowerHalf(Vec128 v) { + return LowerHalf(Simd(), v); +} + +// ------------------------------ ShiftLeftBytes + +template +HWY_API Vec128 ShiftLeftBytes(Simd /* tag */, Vec128 v) { + static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes"); + return Vec128{_mm_slli_si128(v.raw, kBytes)}; +} + +template +HWY_API Vec128 ShiftLeftBytes(const Vec128 v) { + return ShiftLeftBytes(DFromV(), v); +} + +// ------------------------------ ShiftLeftLanes + +template +HWY_API Vec128 ShiftLeftLanes(Simd d, const Vec128 v) { + const Repartition d8; + return BitCast(d, ShiftLeftBytes(BitCast(d8, v))); +} + +template +HWY_API Vec128 ShiftLeftLanes(const Vec128 v) { + return ShiftLeftLanes(DFromV(), v); +} + +// ------------------------------ ShiftRightBytes +template +HWY_API Vec128 ShiftRightBytes(Simd /* tag */, Vec128 v) { + static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes"); + // For partial vectors, clear upper lanes so we shift in zeros. + if (N != 16 / sizeof(T)) { + const Vec128 vfull{v.raw}; + v = Vec128{IfThenElseZero(FirstN(Full128(), N), vfull).raw}; + } + return Vec128{_mm_srli_si128(v.raw, kBytes)}; +} + +// ------------------------------ ShiftRightLanes +template +HWY_API Vec128 ShiftRightLanes(Simd d, const Vec128 v) { + const Repartition d8; + return BitCast(d, ShiftRightBytes(d8, BitCast(d8, v))); +} + +// ------------------------------ UpperHalf (ShiftRightBytes) + +// Full input: copy hi into lo (smaller instruction encoding than shifts). 
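+// Editor's note (illustrative): for Vec128<uint32_t> v with lanes {3,2,1,0},
+// UpperHalf(Full64<uint32_t>(), v) returns {3,2} as a Vec64; together with
+// LowerHalf this splits vectors for tree-style reductions.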
+template +HWY_API Vec64 UpperHalf(Half> /* tag */, Vec128 v) { + return Vec64{_mm_unpackhi_epi64(v.raw, v.raw)}; +} +HWY_API Vec128 UpperHalf(Full64 /* tag */, Vec128 v) { + return Vec128{_mm_movehl_ps(v.raw, v.raw)}; +} +HWY_API Vec64 UpperHalf(Full64 /* tag */, Vec128 v) { + return Vec64{_mm_unpackhi_pd(v.raw, v.raw)}; +} + +// Partial +template +HWY_API Vec128 UpperHalf(Half> /* tag */, + Vec128 v) { + const DFromV d; + const RebindToUnsigned du; + const auto vu = BitCast(du, v); + const auto upper = BitCast(d, ShiftRightBytes(du, vu)); + return Vec128{upper.raw}; +} + +// ------------------------------ ExtractLane (UpperHalf) + +namespace detail { + +template +HWY_INLINE T ExtractLane(const Vec128 v) { + static_assert(kLane < N, "Lane index out of bounds"); +#if HWY_TARGET == HWY_SSSE3 + const int pair = _mm_extract_epi16(v.raw, kLane / 2); + constexpr int kShift = kLane & 1 ? 8 : 0; + return static_cast((pair >> kShift) & 0xFF); +#else + return static_cast(_mm_extract_epi8(v.raw, kLane) & 0xFF); +#endif +} + +template +HWY_INLINE T ExtractLane(const Vec128 v) { + static_assert(kLane < N, "Lane index out of bounds"); + return static_cast(_mm_extract_epi16(v.raw, kLane) & 0xFFFF); +} + +template +HWY_INLINE T ExtractLane(const Vec128 v) { + static_assert(kLane < N, "Lane index out of bounds"); +#if HWY_TARGET == HWY_SSSE3 + alignas(16) T lanes[4]; + Store(v, DFromV(), lanes); + return lanes[kLane]; +#else + return static_cast(_mm_extract_epi32(v.raw, kLane)); +#endif +} + +template +HWY_INLINE T ExtractLane(const Vec128 v) { + static_assert(kLane < N, "Lane index out of bounds"); +#if HWY_TARGET == HWY_SSSE3 || HWY_ARCH_X86_32 + alignas(16) T lanes[2]; + Store(v, DFromV(), lanes); + return lanes[kLane]; +#else + return static_cast(_mm_extract_epi64(v.raw, kLane)); +#endif +} + +template +HWY_INLINE float ExtractLane(const Vec128 v) { + static_assert(kLane < N, "Lane index out of bounds"); +#if HWY_TARGET == HWY_SSSE3 + alignas(16) float lanes[4]; + Store(v, DFromV(), lanes); + return lanes[kLane]; +#else + // Bug in the intrinsic, returns int but should be float. + const int32_t bits = _mm_extract_ps(v.raw, kLane); + float ret; + CopySameSize(&bits, &ret); + return ret; +#endif +} + +// There is no extract_pd; two overloads because there is no UpperHalf for N=1. +template +HWY_INLINE double ExtractLane(const Vec128 v) { + static_assert(kLane == 0, "Lane index out of bounds"); + return GetLane(v); +} + +template +HWY_INLINE double ExtractLane(const Vec128 v) { + static_assert(kLane < 2, "Lane index out of bounds"); + const Half> dh; + return kLane == 0 ? GetLane(v) : GetLane(UpperHalf(dh, v)); +} + +} // namespace detail + +// Requires one overload per vector length because ExtractLane<3> may be a +// compile error if it calls _mm_extract_epi64. 
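+// Editor's note (illustrative): __builtin_constant_p lets the switches below
+// collapse to a single instruction when the index is a literal, e.g.
+//   const int32_t x = ExtractLane(v, 2);  // _mm_extract_epi32 on SSE4+
+// whereas a runtime index spills the lanes to the stack and indexes them.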
+template +HWY_API T ExtractLane(const Vec128 v, size_t i) { + HWY_DASSERT(i == 0); + (void)i; + return GetLane(v); +} + +template +HWY_API T ExtractLane(const Vec128 v, size_t i) { +#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang + if (__builtin_constant_p(i)) { + switch (i) { + case 0: + return detail::ExtractLane<0>(v); + case 1: + return detail::ExtractLane<1>(v); + } + } +#endif + alignas(16) T lanes[2]; + Store(v, DFromV(), lanes); + return lanes[i]; +} + +template +HWY_API T ExtractLane(const Vec128 v, size_t i) { +#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang + if (__builtin_constant_p(i)) { + switch (i) { + case 0: + return detail::ExtractLane<0>(v); + case 1: + return detail::ExtractLane<1>(v); + case 2: + return detail::ExtractLane<2>(v); + case 3: + return detail::ExtractLane<3>(v); + } + } +#endif + alignas(16) T lanes[4]; + Store(v, DFromV(), lanes); + return lanes[i]; +} + +template +HWY_API T ExtractLane(const Vec128 v, size_t i) { +#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang + if (__builtin_constant_p(i)) { + switch (i) { + case 0: + return detail::ExtractLane<0>(v); + case 1: + return detail::ExtractLane<1>(v); + case 2: + return detail::ExtractLane<2>(v); + case 3: + return detail::ExtractLane<3>(v); + case 4: + return detail::ExtractLane<4>(v); + case 5: + return detail::ExtractLane<5>(v); + case 6: + return detail::ExtractLane<6>(v); + case 7: + return detail::ExtractLane<7>(v); + } + } +#endif + alignas(16) T lanes[8]; + Store(v, DFromV(), lanes); + return lanes[i]; +} + +template +HWY_API T ExtractLane(const Vec128 v, size_t i) { +#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang + if (__builtin_constant_p(i)) { + switch (i) { + case 0: + return detail::ExtractLane<0>(v); + case 1: + return detail::ExtractLane<1>(v); + case 2: + return detail::ExtractLane<2>(v); + case 3: + return detail::ExtractLane<3>(v); + case 4: + return detail::ExtractLane<4>(v); + case 5: + return detail::ExtractLane<5>(v); + case 6: + return detail::ExtractLane<6>(v); + case 7: + return detail::ExtractLane<7>(v); + case 8: + return detail::ExtractLane<8>(v); + case 9: + return detail::ExtractLane<9>(v); + case 10: + return detail::ExtractLane<10>(v); + case 11: + return detail::ExtractLane<11>(v); + case 12: + return detail::ExtractLane<12>(v); + case 13: + return detail::ExtractLane<13>(v); + case 14: + return detail::ExtractLane<14>(v); + case 15: + return detail::ExtractLane<15>(v); + } + } +#endif + alignas(16) T lanes[16]; + Store(v, DFromV(), lanes); + return lanes[i]; +} + +// ------------------------------ InsertLane (UpperHalf) + +namespace detail { + +template +HWY_INLINE Vec128 InsertLane(const Vec128 v, T t) { + static_assert(kLane < N, "Lane index out of bounds"); +#if HWY_TARGET == HWY_SSSE3 + const DFromV d; + alignas(16) T lanes[16]; + Store(v, d, lanes); + lanes[kLane] = t; + return Load(d, lanes); +#else + return Vec128{_mm_insert_epi8(v.raw, t, kLane)}; +#endif +} + +template +HWY_INLINE Vec128 InsertLane(const Vec128 v, T t) { + static_assert(kLane < N, "Lane index out of bounds"); + return Vec128{_mm_insert_epi16(v.raw, t, kLane)}; +} + +template +HWY_INLINE Vec128 InsertLane(const Vec128 v, T t) { + static_assert(kLane < N, "Lane index out of bounds"); +#if HWY_TARGET == HWY_SSSE3 + alignas(16) T lanes[4]; + const DFromV d; + Store(v, d, lanes); + lanes[kLane] = t; + return Load(d, lanes); +#else + MakeSigned ti; + CopySameSize(&t, &ti); // don't just cast because T might be float. 
+ return Vec128{_mm_insert_epi32(v.raw, ti, kLane)}; +#endif +} + +template +HWY_INLINE Vec128 InsertLane(const Vec128 v, T t) { + static_assert(kLane < N, "Lane index out of bounds"); +#if HWY_TARGET == HWY_SSSE3 || HWY_ARCH_X86_32 + const DFromV d; + alignas(16) T lanes[2]; + Store(v, d, lanes); + lanes[kLane] = t; + return Load(d, lanes); +#else + MakeSigned ti; + CopySameSize(&t, &ti); // don't just cast because T might be float. + return Vec128{_mm_insert_epi64(v.raw, ti, kLane)}; +#endif +} + +template +HWY_INLINE Vec128 InsertLane(const Vec128 v, float t) { + static_assert(kLane < N, "Lane index out of bounds"); +#if HWY_TARGET == HWY_SSSE3 + const DFromV d; + alignas(16) float lanes[4]; + Store(v, d, lanes); + lanes[kLane] = t; + return Load(d, lanes); +#else + return Vec128{_mm_insert_ps(v.raw, _mm_set_ss(t), kLane << 4)}; +#endif +} + +// There is no insert_pd; two overloads because there is no UpperHalf for N=1. +template +HWY_INLINE Vec128 InsertLane(const Vec128 v, double t) { + static_assert(kLane == 0, "Lane index out of bounds"); + return Set(DFromV(), t); +} + +template +HWY_INLINE Vec128 InsertLane(const Vec128 v, double t) { + static_assert(kLane < 2, "Lane index out of bounds"); + const DFromV d; + const Vec128 vt = Set(d, t); + if (kLane == 0) { + return Vec128{_mm_shuffle_pd(vt.raw, v.raw, 2)}; + } + return Vec128{_mm_shuffle_pd(v.raw, vt.raw, 0)}; +} + +} // namespace detail + +// Requires one overload per vector length because InsertLane<3> may be a +// compile error if it calls _mm_insert_epi64. + +template +HWY_API Vec128 InsertLane(const Vec128 v, size_t i, T t) { + HWY_DASSERT(i == 0); + (void)i; + return Set(DFromV(), t); +} + +template +HWY_API Vec128 InsertLane(const Vec128 v, size_t i, T t) { +#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang + if (__builtin_constant_p(i)) { + switch (i) { + case 0: + return detail::InsertLane<0>(v, t); + case 1: + return detail::InsertLane<1>(v, t); + } + } +#endif + const DFromV d; + alignas(16) T lanes[2]; + Store(v, d, lanes); + lanes[i] = t; + return Load(d, lanes); +} + +template +HWY_API Vec128 InsertLane(const Vec128 v, size_t i, T t) { +#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang + if (__builtin_constant_p(i)) { + switch (i) { + case 0: + return detail::InsertLane<0>(v, t); + case 1: + return detail::InsertLane<1>(v, t); + case 2: + return detail::InsertLane<2>(v, t); + case 3: + return detail::InsertLane<3>(v, t); + } + } +#endif + const DFromV d; + alignas(16) T lanes[4]; + Store(v, d, lanes); + lanes[i] = t; + return Load(d, lanes); +} + +template +HWY_API Vec128 InsertLane(const Vec128 v, size_t i, T t) { +#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang + if (__builtin_constant_p(i)) { + switch (i) { + case 0: + return detail::InsertLane<0>(v, t); + case 1: + return detail::InsertLane<1>(v, t); + case 2: + return detail::InsertLane<2>(v, t); + case 3: + return detail::InsertLane<3>(v, t); + case 4: + return detail::InsertLane<4>(v, t); + case 5: + return detail::InsertLane<5>(v, t); + case 6: + return detail::InsertLane<6>(v, t); + case 7: + return detail::InsertLane<7>(v, t); + } + } +#endif + const DFromV d; + alignas(16) T lanes[8]; + Store(v, d, lanes); + lanes[i] = t; + return Load(d, lanes); +} + +template +HWY_API Vec128 InsertLane(const Vec128 v, size_t i, T t) { +#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang + if (__builtin_constant_p(i)) { + switch (i) { + case 0: + return detail::InsertLane<0>(v, t); + case 1: + return 
detail::InsertLane<1>(v, t); + case 2: + return detail::InsertLane<2>(v, t); + case 3: + return detail::InsertLane<3>(v, t); + case 4: + return detail::InsertLane<4>(v, t); + case 5: + return detail::InsertLane<5>(v, t); + case 6: + return detail::InsertLane<6>(v, t); + case 7: + return detail::InsertLane<7>(v, t); + case 8: + return detail::InsertLane<8>(v, t); + case 9: + return detail::InsertLane<9>(v, t); + case 10: + return detail::InsertLane<10>(v, t); + case 11: + return detail::InsertLane<11>(v, t); + case 12: + return detail::InsertLane<12>(v, t); + case 13: + return detail::InsertLane<13>(v, t); + case 14: + return detail::InsertLane<14>(v, t); + case 15: + return detail::InsertLane<15>(v, t); + } + } +#endif + const DFromV d; + alignas(16) T lanes[16]; + Store(v, d, lanes); + lanes[i] = t; + return Load(d, lanes); +} + +// ------------------------------ CombineShiftRightBytes + +template > +HWY_API V CombineShiftRightBytes(Full128 d, V hi, V lo) { + const Repartition d8; + return BitCast(d, Vec128{_mm_alignr_epi8( + BitCast(d8, hi).raw, BitCast(d8, lo).raw, kBytes)}); +} + +template > +HWY_API V CombineShiftRightBytes(Simd d, V hi, V lo) { + constexpr size_t kSize = N * sizeof(T); + static_assert(0 < kBytes && kBytes < kSize, "kBytes invalid"); + const Repartition d8; + const Full128 d_full8; + using V8 = VFromD; + const V8 hi8{BitCast(d8, hi).raw}; + // Move into most-significant bytes + const V8 lo8 = ShiftLeftBytes<16 - kSize>(V8{BitCast(d8, lo).raw}); + const V8 r = CombineShiftRightBytes<16 - kSize + kBytes>(d_full8, hi8, lo8); + return V{BitCast(Full128(), r).raw}; +} + +// ------------------------------ Broadcast/splat any lane + +// Unsigned +template +HWY_API Vec128 Broadcast(const Vec128 v) { + static_assert(0 <= kLane && kLane < N, "Invalid lane"); + if (kLane < 4) { + const __m128i lo = _mm_shufflelo_epi16(v.raw, (0x55 * kLane) & 0xFF); + return Vec128{_mm_unpacklo_epi64(lo, lo)}; + } else { + const __m128i hi = _mm_shufflehi_epi16(v.raw, (0x55 * (kLane - 4)) & 0xFF); + return Vec128{_mm_unpackhi_epi64(hi, hi)}; + } +} +template +HWY_API Vec128 Broadcast(const Vec128 v) { + static_assert(0 <= kLane && kLane < N, "Invalid lane"); + return Vec128{_mm_shuffle_epi32(v.raw, 0x55 * kLane)}; +} +template +HWY_API Vec128 Broadcast(const Vec128 v) { + static_assert(0 <= kLane && kLane < N, "Invalid lane"); + return Vec128{_mm_shuffle_epi32(v.raw, kLane ? 0xEE : 0x44)}; +} + +// Signed +template +HWY_API Vec128 Broadcast(const Vec128 v) { + static_assert(0 <= kLane && kLane < N, "Invalid lane"); + if (kLane < 4) { + const __m128i lo = _mm_shufflelo_epi16(v.raw, (0x55 * kLane) & 0xFF); + return Vec128{_mm_unpacklo_epi64(lo, lo)}; + } else { + const __m128i hi = _mm_shufflehi_epi16(v.raw, (0x55 * (kLane - 4)) & 0xFF); + return Vec128{_mm_unpackhi_epi64(hi, hi)}; + } +} +template +HWY_API Vec128 Broadcast(const Vec128 v) { + static_assert(0 <= kLane && kLane < N, "Invalid lane"); + return Vec128{_mm_shuffle_epi32(v.raw, 0x55 * kLane)}; +} +template +HWY_API Vec128 Broadcast(const Vec128 v) { + static_assert(0 <= kLane && kLane < N, "Invalid lane"); + return Vec128{_mm_shuffle_epi32(v.raw, kLane ? 
0xEE : 0x44)}; +} + +// Float +template +HWY_API Vec128 Broadcast(const Vec128 v) { + static_assert(0 <= kLane && kLane < N, "Invalid lane"); + return Vec128{_mm_shuffle_ps(v.raw, v.raw, 0x55 * kLane)}; +} +template +HWY_API Vec128 Broadcast(const Vec128 v) { + static_assert(0 <= kLane && kLane < N, "Invalid lane"); + return Vec128{_mm_shuffle_pd(v.raw, v.raw, 3 * kLane)}; +} + +// ------------------------------ TableLookupLanes (Shuffle01) + +// Returned by SetTableIndices/IndicesFromVec for use by TableLookupLanes. +template +struct Indices128 { + __m128i raw; +}; + +template +HWY_API Indices128 IndicesFromVec(Simd d, Vec128 vec) { + static_assert(sizeof(T) == sizeof(TI), "Index size must match lane"); +#if HWY_IS_DEBUG_BUILD + const Rebind di; + HWY_DASSERT(AllFalse(di, Lt(vec, Zero(di))) && + AllTrue(di, Lt(vec, Set(di, N)))); +#endif + +#if HWY_TARGET <= HWY_AVX2 + (void)d; + return Indices128{vec.raw}; +#else + const Repartition d8; + using V8 = VFromD; + alignas(16) constexpr uint8_t kByteOffsets[16] = {0, 1, 2, 3, 0, 1, 2, 3, + 0, 1, 2, 3, 0, 1, 2, 3}; + + // Broadcast each lane index to all 4 bytes of T + alignas(16) constexpr uint8_t kBroadcastLaneBytes[16] = { + 0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12}; + const V8 lane_indices = TableLookupBytes(vec, Load(d8, kBroadcastLaneBytes)); + + // Shift to bytes + const Repartition d16; + const V8 byte_indices = BitCast(d8, ShiftLeft<2>(BitCast(d16, lane_indices))); + + return Indices128{Add(byte_indices, Load(d8, kByteOffsets)).raw}; +#endif +} + +template +HWY_API Indices128 IndicesFromVec(Simd d, Vec128 vec) { + static_assert(sizeof(T) == sizeof(TI), "Index size must match lane"); +#if HWY_IS_DEBUG_BUILD + const Rebind di; + HWY_DASSERT(AllFalse(di, Lt(vec, Zero(di))) && + AllTrue(di, Lt(vec, Set(di, static_cast(N))))); +#else + (void)d; +#endif + + // No change - even without AVX3, we can shuffle+blend. + return Indices128{vec.raw}; +} + +template +HWY_API Indices128 SetTableIndices(Simd d, const TI* idx) { + const Rebind di; + return IndicesFromVec(d, LoadU(di, idx)); +} + +template +HWY_API Vec128 TableLookupLanes(Vec128 v, Indices128 idx) { +#if HWY_TARGET <= HWY_AVX2 + const DFromV d; + const RebindToFloat df; + const Vec128 perm{_mm_permutevar_ps(BitCast(df, v).raw, idx.raw)}; + return BitCast(d, perm); +#else + return TableLookupBytes(v, Vec128{idx.raw}); +#endif +} + +template +HWY_API Vec128 TableLookupLanes(Vec128 v, + Indices128 idx) { +#if HWY_TARGET <= HWY_AVX2 + return Vec128{_mm_permutevar_ps(v.raw, idx.raw)}; +#else + const DFromV df; + const RebindToSigned di; + return BitCast(df, + TableLookupBytes(BitCast(di, v), Vec128{idx.raw})); +#endif +} + +// Single lane: no change +template +HWY_API Vec128 TableLookupLanes(Vec128 v, + Indices128 /* idx */) { + return v; +} + +template +HWY_API Vec128 TableLookupLanes(Vec128 v, Indices128 idx) { + const Full128 d; + Vec128 vidx{idx.raw}; +#if HWY_TARGET <= HWY_AVX2 + // There is no _mm_permute[x]var_epi64. + vidx += vidx; // bit1 is the decider (unusual) + const Full128 df; + return BitCast( + d, Vec128{_mm_permutevar_pd(BitCast(df, v).raw, vidx.raw)}); +#else + // Only 2 lanes: can swap+blend. Choose v if vidx == iota. To avoid a 64-bit + // comparison (expensive on SSSE3), just invert the upper lane and subtract 1 + // to obtain an all-zero or all-one mask. 
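+  // (Worked example) With two lanes and vidx == iota == {0, 1}, the xor is 0
+  // and subtracting 1 gives all-ones (MSB set), so MaskFromVec selects v.
+  // A swapped index differs in bit 0, so the xor is 1 and subtracting 1
+  // gives 0 (MSB clear), selecting Shuffle01(v).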
+  const Full128<int64_t> di;
+  const Vec128<int64_t> same = (vidx ^ Iota(di, 0)) - Set(di, 1);
+  const Mask128<T> mask_same = RebindMask(d, MaskFromVec(same));
+  return IfThenElse(mask_same, v, Shuffle01(v));
+#endif
+}
+
+HWY_API Vec128<double> TableLookupLanes(Vec128<double> v,
+                                        Indices128<double> idx) {
+  Vec128<int64_t> vidx{idx.raw};
+#if HWY_TARGET <= HWY_AVX2
+  vidx += vidx;  // bit1 is the decider (unusual)
+  return Vec128<double>{_mm_permutevar_pd(v.raw, vidx.raw)};
+#else
+  // Only 2 lanes: can swap+blend. Choose v if vidx == iota. To avoid a 64-bit
+  // comparison (expensive on SSSE3), just invert the upper lane and subtract 1
+  // to obtain an all-zero or all-one mask.
+  const Full128<double> d;
+  const Full128<int64_t> di;
+  const Vec128<int64_t> same = (vidx ^ Iota(di, 0)) - Set(di, 1);
+  const Mask128<double> mask_same = RebindMask(d, MaskFromVec(same));
+  return IfThenElse(mask_same, v, Shuffle01(v));
+#endif
+}
+
+// ------------------------------ ReverseBlocks
+
+// Single block: no change
+template <typename T>
+HWY_API Vec128<T> ReverseBlocks(Full128<T> /* tag */, const Vec128<T> v) {
+  return v;
+}
+
+// ------------------------------ Reverse (Shuffle0123, Shuffle2301)
+
+// Single lane: no change
+template <typename T>
+HWY_API Vec128<T, 1> Reverse(Simd<T, 1, 0> /* tag */, const Vec128<T, 1> v) {
+  return v;
+}
+
+// Two lanes: shuffle
+template <typename T, HWY_IF_LANE_SIZE(T, 4)>
+HWY_API Vec128<T, 2> Reverse(Full64<T> /* tag */, const Vec128<T, 2> v) {
+  return Vec128<T, 2>{Shuffle2301(Vec128<T>{v.raw}).raw};
+}
+
+template <typename T, HWY_IF_LANE_SIZE(T, 8)>
+HWY_API Vec128<T> Reverse(Full128<T> /* tag */, const Vec128<T> v) {
+  return Shuffle01(v);
+}
+
+// Four lanes: shuffle
+template <typename T, HWY_IF_LANE_SIZE(T, 4)>
+HWY_API Vec128<T> Reverse(Full128<T> /* tag */, const Vec128<T> v) {
+  return Shuffle0123(v);
+}
+
+// 16-bit
+template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
+HWY_API Vec128<T, N> Reverse(Simd<T, N, 0> d, const Vec128<T, N> v) {
+#if HWY_TARGET <= HWY_AVX3
+  if (N == 1) return v;
+  if (N == 2) {
+    const Repartition<uint32_t, decltype(d)> du32;
+    return BitCast(d, RotateRight<16>(BitCast(du32, v)));
+  }
+  const RebindToSigned<decltype(d)> di;
+  alignas(16) constexpr int16_t kReverse[8] = {7, 6, 5, 4, 3, 2, 1, 0};
+  const Vec128<int16_t, N> idx = Load(di, kReverse + (N == 8 ? 0 : 4));
+  return BitCast(d, Vec128<int16_t, N>{
+                        _mm_permutexvar_epi16(idx.raw, BitCast(di, v).raw)});
+#else
+  const RepartitionToWide<RebindToUnsigned<decltype(d)>> du32;
+  return BitCast(d, RotateRight<16>(Reverse(du32, BitCast(du32, v))));
+#endif
+}
+
+// ------------------------------ Reverse2
+
+template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
+HWY_API Vec128<T, N> Reverse2(Simd<T, N, 0> d, const Vec128<T, N> v) {
+  const Repartition<uint32_t, decltype(d)> du32;
+  return BitCast(d, RotateRight<16>(BitCast(du32, v)));
+}
+
+template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
+HWY_API Vec128<T, N> Reverse2(Simd<T, N, 0> /* tag */, const Vec128<T, N> v) {
+  return Shuffle2301(v);
+}
+
+template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
+HWY_API Vec128<T, N> Reverse2(Simd<T, N, 0> /* tag */, const Vec128<T, N> v) {
+  return Shuffle01(v);
+}
+
+// ------------------------------ Reverse4
+
+template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
+HWY_API Vec128<T, N> Reverse4(Simd<T, N, 0> d, const Vec128<T, N> v) {
+  const RebindToSigned<decltype(d)> di;
+  // 4x 16-bit: a single shufflelo suffices.
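+  // (Illustration) _mm_shufflelo_epi16 with _MM_SHUFFLE(0, 1, 2, 3) maps the
+  // low four u16 lanes {a, b, c, d} to {d, c, b, a}; the upper 64 bits are
+  // left as-is but are unused when N == 4.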
+ if (N == 4) { + return BitCast(d, Vec128{_mm_shufflelo_epi16( + BitCast(di, v).raw, _MM_SHUFFLE(0, 1, 2, 3))}); + } + +#if HWY_TARGET <= HWY_AVX3 + alignas(16) constexpr int16_t kReverse4[8] = {3, 2, 1, 0, 7, 6, 5, 4}; + const Vec128 idx = Load(di, kReverse4); + return BitCast(d, Vec128{ + _mm_permutexvar_epi16(idx.raw, BitCast(di, v).raw)}); +#else + const RepartitionToWide dw; + return Reverse2(d, BitCast(d, Shuffle2301(BitCast(dw, v)))); +#endif +} + +// 4x 32-bit: use Shuffle0123 +template +HWY_API Vec128 Reverse4(Full128 /* tag */, const Vec128 v) { + return Shuffle0123(v); +} + +template +HWY_API Vec128 Reverse4(Simd /* tag */, Vec128 /* v */) { + HWY_ASSERT(0); // don't have 4 u64 lanes +} + +// ------------------------------ Reverse8 + +template +HWY_API Vec128 Reverse8(Simd d, const Vec128 v) { +#if HWY_TARGET <= HWY_AVX3 + const RebindToSigned di; + alignas(32) constexpr int16_t kReverse8[16] = {7, 6, 5, 4, 3, 2, 1, 0, + 15, 14, 13, 12, 11, 10, 9, 8}; + const Vec128 idx = Load(di, kReverse8); + return BitCast(d, Vec128{ + _mm_permutexvar_epi16(idx.raw, BitCast(di, v).raw)}); +#else + const RepartitionToWide dw; + return Reverse2(d, BitCast(d, Shuffle0123(BitCast(dw, v)))); +#endif +} + +template +HWY_API Vec128 Reverse8(Simd /* tag */, Vec128 /* v */) { + HWY_ASSERT(0); // don't have 8 lanes unless 16-bit +} + +// ------------------------------ InterleaveLower + +// Interleaves lanes from halves of the 128-bit blocks of "a" (which provides +// the least-significant lane) and "b". To concatenate two half-width integers +// into one, use ZipLower/Upper instead (also works with scalar). + +template +HWY_API Vec128 InterleaveLower(const Vec128 a, + const Vec128 b) { + return Vec128{_mm_unpacklo_epi8(a.raw, b.raw)}; +} +template +HWY_API Vec128 InterleaveLower(const Vec128 a, + const Vec128 b) { + return Vec128{_mm_unpacklo_epi16(a.raw, b.raw)}; +} +template +HWY_API Vec128 InterleaveLower(const Vec128 a, + const Vec128 b) { + return Vec128{_mm_unpacklo_epi32(a.raw, b.raw)}; +} +template +HWY_API Vec128 InterleaveLower(const Vec128 a, + const Vec128 b) { + return Vec128{_mm_unpacklo_epi64(a.raw, b.raw)}; +} + +template +HWY_API Vec128 InterleaveLower(const Vec128 a, + const Vec128 b) { + return Vec128{_mm_unpacklo_epi8(a.raw, b.raw)}; +} +template +HWY_API Vec128 InterleaveLower(const Vec128 a, + const Vec128 b) { + return Vec128{_mm_unpacklo_epi16(a.raw, b.raw)}; +} +template +HWY_API Vec128 InterleaveLower(const Vec128 a, + const Vec128 b) { + return Vec128{_mm_unpacklo_epi32(a.raw, b.raw)}; +} +template +HWY_API Vec128 InterleaveLower(const Vec128 a, + const Vec128 b) { + return Vec128{_mm_unpacklo_epi64(a.raw, b.raw)}; +} + +template +HWY_API Vec128 InterleaveLower(const Vec128 a, + const Vec128 b) { + return Vec128{_mm_unpacklo_ps(a.raw, b.raw)}; +} +template +HWY_API Vec128 InterleaveLower(const Vec128 a, + const Vec128 b) { + return Vec128{_mm_unpacklo_pd(a.raw, b.raw)}; +} + +// Additional overload for the optional tag (also for 256/512). +template +HWY_API V InterleaveLower(DFromV /* tag */, V a, V b) { + return InterleaveLower(a, b); +} + +// ------------------------------ InterleaveUpper (UpperHalf) + +// All functions inside detail lack the required D parameter. 
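+// (Illustration) For u32x4 a = {a0,a1,a2,a3} and b = {b0,b1,b2,b3},
+// InterleaveLower returns {a0, b0, a1, b1} and InterleaveUpper (below)
+// returns {a2, b2, a3, b3}.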
+namespace detail { + +HWY_API Vec128 InterleaveUpper(const Vec128 a, + const Vec128 b) { + return Vec128{_mm_unpackhi_epi8(a.raw, b.raw)}; +} +HWY_API Vec128 InterleaveUpper(const Vec128 a, + const Vec128 b) { + return Vec128{_mm_unpackhi_epi16(a.raw, b.raw)}; +} +HWY_API Vec128 InterleaveUpper(const Vec128 a, + const Vec128 b) { + return Vec128{_mm_unpackhi_epi32(a.raw, b.raw)}; +} +HWY_API Vec128 InterleaveUpper(const Vec128 a, + const Vec128 b) { + return Vec128{_mm_unpackhi_epi64(a.raw, b.raw)}; +} + +HWY_API Vec128 InterleaveUpper(const Vec128 a, + const Vec128 b) { + return Vec128{_mm_unpackhi_epi8(a.raw, b.raw)}; +} +HWY_API Vec128 InterleaveUpper(const Vec128 a, + const Vec128 b) { + return Vec128{_mm_unpackhi_epi16(a.raw, b.raw)}; +} +HWY_API Vec128 InterleaveUpper(const Vec128 a, + const Vec128 b) { + return Vec128{_mm_unpackhi_epi32(a.raw, b.raw)}; +} +HWY_API Vec128 InterleaveUpper(const Vec128 a, + const Vec128 b) { + return Vec128{_mm_unpackhi_epi64(a.raw, b.raw)}; +} + +HWY_API Vec128 InterleaveUpper(const Vec128 a, + const Vec128 b) { + return Vec128{_mm_unpackhi_ps(a.raw, b.raw)}; +} +HWY_API Vec128 InterleaveUpper(const Vec128 a, + const Vec128 b) { + return Vec128{_mm_unpackhi_pd(a.raw, b.raw)}; +} + +} // namespace detail + +// Full +template > +HWY_API V InterleaveUpper(Full128 /* tag */, V a, V b) { + return detail::InterleaveUpper(a, b); +} + +// Partial +template > +HWY_API V InterleaveUpper(Simd d, V a, V b) { + const Half d2; + return InterleaveLower(d, V{UpperHalf(d2, a).raw}, V{UpperHalf(d2, b).raw}); +} + +// ------------------------------ ZipLower/ZipUpper (InterleaveLower) + +// Same as Interleave*, except that the return lanes are double-width integers; +// this is necessary because the single-lane scalar cannot return two values. +template >> +HWY_API VFromD ZipLower(V a, V b) { + return BitCast(DW(), InterleaveLower(a, b)); +} +template , class DW = RepartitionToWide> +HWY_API VFromD ZipLower(DW dw, V a, V b) { + return BitCast(dw, InterleaveLower(D(), a, b)); +} + +template , class DW = RepartitionToWide> +HWY_API VFromD ZipUpper(DW dw, V a, V b) { + return BitCast(dw, InterleaveUpper(D(), a, b)); +} + +// ================================================== COMBINE + +// ------------------------------ Combine (InterleaveLower) + +// N = N/2 + N/2 (upper half undefined) +template +HWY_API Vec128 Combine(Simd d, Vec128 hi_half, + Vec128 lo_half) { + const Half d2; + const RebindToUnsigned du2; + // Treat half-width input as one lane, and expand to two lanes. 
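+  // (Illustration) e.g. combining two u32x2 halves: each half is viewed as a
+  // single u64 lane, and InterleaveLower(lo, hi) yields {lo, hi}, i.e.
+  // {lo0, lo1, hi0, hi1} when viewed as u32 again.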
+ using VU = Vec128, 2>; + const VU lo{BitCast(du2, lo_half).raw}; + const VU hi{BitCast(du2, hi_half).raw}; + return BitCast(d, InterleaveLower(lo, hi)); +} + +// ------------------------------ ZeroExtendVector (Combine, IfThenElseZero) + +// Tag dispatch instead of SFINAE for MSVC 2017 compatibility +namespace detail { + +template +HWY_INLINE Vec128 ZeroExtendVector(hwy::NonFloatTag /*tag*/, + Full128 /* d */, Vec64 lo) { + return Vec128{_mm_move_epi64(lo.raw)}; +} + +template +HWY_INLINE Vec128 ZeroExtendVector(hwy::FloatTag /*tag*/, Full128 d, + Vec64 lo) { + const RebindToUnsigned du; + return BitCast(d, ZeroExtendVector(du, BitCast(Half(), lo))); +} + +} // namespace detail + +template +HWY_API Vec128 ZeroExtendVector(Full128 d, Vec64 lo) { + return detail::ZeroExtendVector(hwy::IsFloatTag(), d, lo); +} + +template +HWY_API Vec128 ZeroExtendVector(Simd d, Vec128 lo) { + return IfThenElseZero(FirstN(d, N / 2), Vec128{lo.raw}); +} + +// ------------------------------ Concat full (InterleaveLower) + +// hiH,hiL loH,loL |-> hiL,loL (= lower halves) +template +HWY_API Vec128 ConcatLowerLower(Full128 d, Vec128 hi, Vec128 lo) { + const Repartition d64; + return BitCast(d, InterleaveLower(BitCast(d64, lo), BitCast(d64, hi))); +} + +// hiH,hiL loH,loL |-> hiH,loH (= upper halves) +template +HWY_API Vec128 ConcatUpperUpper(Full128 d, Vec128 hi, Vec128 lo) { + const Repartition d64; + return BitCast(d, InterleaveUpper(d64, BitCast(d64, lo), BitCast(d64, hi))); +} + +// hiH,hiL loH,loL |-> hiL,loH (= inner halves) +template +HWY_API Vec128 ConcatLowerUpper(Full128 d, const Vec128 hi, + const Vec128 lo) { + return CombineShiftRightBytes<8>(d, hi, lo); +} + +// hiH,hiL loH,loL |-> hiH,loL (= outer halves) +template +HWY_API Vec128 ConcatUpperLower(Full128 d, Vec128 hi, Vec128 lo) { + const Repartition dd; +#if HWY_TARGET == HWY_SSSE3 + return BitCast( + d, Vec128{_mm_shuffle_pd(BitCast(dd, lo).raw, BitCast(dd, hi).raw, + _MM_SHUFFLE2(1, 0))}); +#else + // _mm_blend_epi16 has throughput 1/cycle on SKX, whereas _pd can do 3/cycle. + return BitCast(d, Vec128{_mm_blend_pd(BitCast(dd, hi).raw, + BitCast(dd, lo).raw, 1)}); +#endif +} +HWY_API Vec128 ConcatUpperLower(Full128 d, Vec128 hi, + Vec128 lo) { +#if HWY_TARGET == HWY_SSSE3 + (void)d; + return Vec128{_mm_shuffle_ps(lo.raw, hi.raw, _MM_SHUFFLE(3, 2, 1, 0))}; +#else + // _mm_shuffle_ps has throughput 1/cycle on SKX, whereas blend can do 3/cycle. + const RepartitionToWide dd; + return BitCast(d, Vec128{_mm_blend_pd(BitCast(dd, hi).raw, + BitCast(dd, lo).raw, 1)}); +#endif +} +HWY_API Vec128 ConcatUpperLower(Full128 /* tag */, + Vec128 hi, Vec128 lo) { +#if HWY_TARGET == HWY_SSSE3 + return Vec128{_mm_shuffle_pd(lo.raw, hi.raw, _MM_SHUFFLE2(1, 0))}; +#else + // _mm_shuffle_pd has throughput 1/cycle on SKX, whereas blend can do 3/cycle. 
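+  // (Illustration) Bit i of the blend immediate selects lane i from the
+  // second operand, so _mm_blend_pd(hi, lo, 1) = {lo[0], hi[1]}, the outer
+  // halves.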
+ return Vec128{_mm_blend_pd(hi.raw, lo.raw, 1)}; +#endif +} + +// ------------------------------ Concat partial (Combine, LowerHalf) + +template +HWY_API Vec128 ConcatLowerLower(Simd d, Vec128 hi, + Vec128 lo) { + const Half d2; + return Combine(d, LowerHalf(d2, hi), LowerHalf(d2, lo)); +} + +template +HWY_API Vec128 ConcatUpperUpper(Simd d, Vec128 hi, + Vec128 lo) { + const Half d2; + return Combine(d, UpperHalf(d2, hi), UpperHalf(d2, lo)); +} + +template +HWY_API Vec128 ConcatLowerUpper(Simd d, const Vec128 hi, + const Vec128 lo) { + const Half d2; + return Combine(d, LowerHalf(d2, hi), UpperHalf(d2, lo)); +} + +template +HWY_API Vec128 ConcatUpperLower(Simd d, Vec128 hi, + Vec128 lo) { + const Half d2; + return Combine(d, UpperHalf(d2, hi), LowerHalf(d2, lo)); +} + +// ------------------------------ ConcatOdd + +// 8-bit full +template +HWY_API Vec128 ConcatOdd(Full128 d, Vec128 hi, Vec128 lo) { + const Repartition dw; + // Right-shift 8 bits per u16 so we can pack. + const Vec128 uH = ShiftRight<8>(BitCast(dw, hi)); + const Vec128 uL = ShiftRight<8>(BitCast(dw, lo)); + return Vec128{_mm_packus_epi16(uL.raw, uH.raw)}; +} + +// 8-bit x8 +template +HWY_API Vec64 ConcatOdd(Simd d, Vec64 hi, Vec64 lo) { + const Repartition du32; + // Don't care about upper half, no need to zero. + alignas(16) const uint8_t kCompactOddU8[8] = {1, 3, 5, 7}; + const Vec64 shuf = BitCast(d, Load(Full64(), kCompactOddU8)); + const Vec64 L = TableLookupBytes(lo, shuf); + const Vec64 H = TableLookupBytes(hi, shuf); + return BitCast(d, InterleaveLower(du32, BitCast(du32, L), BitCast(du32, H))); +} + +// 8-bit x4 +template +HWY_API Vec32 ConcatOdd(Simd d, Vec32 hi, Vec32 lo) { + const Repartition du16; + // Don't care about upper half, no need to zero. + alignas(16) const uint8_t kCompactOddU8[4] = {1, 3}; + const Vec32 shuf = BitCast(d, Load(Full32(), kCompactOddU8)); + const Vec32 L = TableLookupBytes(lo, shuf); + const Vec32 H = TableLookupBytes(hi, shuf); + return BitCast(d, InterleaveLower(du16, BitCast(du16, L), BitCast(du16, H))); +} + +// 16-bit full +template +HWY_API Vec128 ConcatOdd(Full128 d, Vec128 hi, Vec128 lo) { + // Right-shift 16 bits per i32 - a *signed* shift of 0x8000xxxx returns + // 0xFFFF8000, which correctly saturates to 0x8000. + const Repartition dw; + const Vec128 uH = ShiftRight<16>(BitCast(dw, hi)); + const Vec128 uL = ShiftRight<16>(BitCast(dw, lo)); + return Vec128{_mm_packs_epi32(uL.raw, uH.raw)}; +} + +// 16-bit x4 +template +HWY_API Vec64 ConcatOdd(Simd d, Vec64 hi, Vec64 lo) { + const Repartition du32; + // Don't care about upper half, no need to zero. 
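+  // (Illustration) Byte pairs (2,3) and (6,7) are u16 lanes 1 and 3, so the
+  // table below compacts the odd lanes into the low half; the neighboring
+  // ConcatOdd/ConcatEven variants use the same pattern with scaled indices.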
+ alignas(16) const uint8_t kCompactOddU16[8] = {2, 3, 6, 7}; + const Vec64 shuf = BitCast(d, Load(Full64(), kCompactOddU16)); + const Vec64 L = TableLookupBytes(lo, shuf); + const Vec64 H = TableLookupBytes(hi, shuf); + return BitCast(d, InterleaveLower(du32, BitCast(du32, L), BitCast(du32, H))); +} + +// 32-bit full +template +HWY_API Vec128 ConcatOdd(Full128 d, Vec128 hi, Vec128 lo) { + const RebindToFloat df; + return BitCast( + d, Vec128{_mm_shuffle_ps(BitCast(df, lo).raw, BitCast(df, hi).raw, + _MM_SHUFFLE(3, 1, 3, 1))}); +} +template +HWY_API Vec128 ConcatOdd(Full128 /* tag */, Vec128 hi, + Vec128 lo) { + return Vec128{_mm_shuffle_ps(lo.raw, hi.raw, _MM_SHUFFLE(3, 1, 3, 1))}; +} + +// Any type x2 +template +HWY_API Vec128 ConcatOdd(Simd d, Vec128 hi, + Vec128 lo) { + return InterleaveUpper(d, lo, hi); +} + +// ------------------------------ ConcatEven (InterleaveLower) + +// 8-bit full +template +HWY_API Vec128 ConcatEven(Full128 d, Vec128 hi, Vec128 lo) { + const Repartition dw; + // Isolate lower 8 bits per u16 so we can pack. + const Vec128 mask = Set(dw, 0x00FF); + const Vec128 uH = And(BitCast(dw, hi), mask); + const Vec128 uL = And(BitCast(dw, lo), mask); + return Vec128{_mm_packus_epi16(uL.raw, uH.raw)}; +} + +// 8-bit x8 +template +HWY_API Vec64 ConcatEven(Simd d, Vec64 hi, Vec64 lo) { + const Repartition du32; + // Don't care about upper half, no need to zero. + alignas(16) const uint8_t kCompactEvenU8[8] = {0, 2, 4, 6}; + const Vec64 shuf = BitCast(d, Load(Full64(), kCompactEvenU8)); + const Vec64 L = TableLookupBytes(lo, shuf); + const Vec64 H = TableLookupBytes(hi, shuf); + return BitCast(d, InterleaveLower(du32, BitCast(du32, L), BitCast(du32, H))); +} + +// 8-bit x4 +template +HWY_API Vec32 ConcatEven(Simd d, Vec32 hi, Vec32 lo) { + const Repartition du16; + // Don't care about upper half, no need to zero. + alignas(16) const uint8_t kCompactEvenU8[4] = {0, 2}; + const Vec32 shuf = BitCast(d, Load(Full32(), kCompactEvenU8)); + const Vec32 L = TableLookupBytes(lo, shuf); + const Vec32 H = TableLookupBytes(hi, shuf); + return BitCast(d, InterleaveLower(du16, BitCast(du16, L), BitCast(du16, H))); +} + +// 16-bit full +template +HWY_API Vec128 ConcatEven(Full128 d, Vec128 hi, Vec128 lo) { +#if HWY_TARGET <= HWY_SSE4 + // Isolate lower 16 bits per u32 so we can pack. + const Repartition dw; + const Vec128 mask = Set(dw, 0x0000FFFF); + const Vec128 uH = And(BitCast(dw, hi), mask); + const Vec128 uL = And(BitCast(dw, lo), mask); + return Vec128{_mm_packus_epi32(uL.raw, uH.raw)}; +#else + // packs_epi32 saturates 0x8000 to 0x7FFF. Instead ConcatEven within the two + // inputs, then concatenate them. + alignas(16) const T kCompactEvenU16[8] = {0x0100, 0x0504, 0x0908, 0x0D0C}; + const Vec128 shuf = BitCast(d, Load(d, kCompactEvenU16)); + const Vec128 L = TableLookupBytes(lo, shuf); + const Vec128 H = TableLookupBytes(hi, shuf); + return ConcatLowerLower(d, H, L); +#endif +} + +// 16-bit x4 +template +HWY_API Vec64 ConcatEven(Simd d, Vec64 hi, Vec64 lo) { + const Repartition du32; + // Don't care about upper half, no need to zero. 
+ alignas(16) const uint8_t kCompactEvenU16[8] = {0, 1, 4, 5}; + const Vec64 shuf = BitCast(d, Load(Full64(), kCompactEvenU16)); + const Vec64 L = TableLookupBytes(lo, shuf); + const Vec64 H = TableLookupBytes(hi, shuf); + return BitCast(d, InterleaveLower(du32, BitCast(du32, L), BitCast(du32, H))); +} + +// 32-bit full +template +HWY_API Vec128 ConcatEven(Full128 d, Vec128 hi, Vec128 lo) { + const RebindToFloat df; + return BitCast( + d, Vec128{_mm_shuffle_ps(BitCast(df, lo).raw, BitCast(df, hi).raw, + _MM_SHUFFLE(2, 0, 2, 0))}); +} +HWY_API Vec128 ConcatEven(Full128 /* tag */, Vec128 hi, + Vec128 lo) { + return Vec128{_mm_shuffle_ps(lo.raw, hi.raw, _MM_SHUFFLE(2, 0, 2, 0))}; +} + +// Any T x2 +template +HWY_API Vec128 ConcatEven(Simd d, Vec128 hi, + Vec128 lo) { + return InterleaveLower(d, lo, hi); +} + +// ------------------------------ DupEven (InterleaveLower) + +template +HWY_API Vec128 DupEven(Vec128 v) { + return Vec128{_mm_shuffle_epi32(v.raw, _MM_SHUFFLE(2, 2, 0, 0))}; +} +template +HWY_API Vec128 DupEven(Vec128 v) { + return Vec128{ + _mm_shuffle_ps(v.raw, v.raw, _MM_SHUFFLE(2, 2, 0, 0))}; +} + +template +HWY_API Vec128 DupEven(const Vec128 v) { + return InterleaveLower(DFromV(), v, v); +} + +// ------------------------------ DupOdd (InterleaveUpper) + +template +HWY_API Vec128 DupOdd(Vec128 v) { + return Vec128{_mm_shuffle_epi32(v.raw, _MM_SHUFFLE(3, 3, 1, 1))}; +} +template +HWY_API Vec128 DupOdd(Vec128 v) { + return Vec128{ + _mm_shuffle_ps(v.raw, v.raw, _MM_SHUFFLE(3, 3, 1, 1))}; +} + +template +HWY_API Vec128 DupOdd(const Vec128 v) { + return InterleaveUpper(DFromV(), v, v); +} + +// ------------------------------ OddEven (IfThenElse) + +template +HWY_INLINE Vec128 OddEven(const Vec128 a, const Vec128 b) { + const DFromV d; + const Repartition d8; + alignas(16) constexpr uint8_t mask[16] = {0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, + 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0}; + return IfThenElse(MaskFromVec(BitCast(d, Load(d8, mask))), b, a); +} + +template +HWY_INLINE Vec128 OddEven(const Vec128 a, const Vec128 b) { +#if HWY_TARGET == HWY_SSSE3 + const DFromV d; + const Repartition d8; + alignas(16) constexpr uint8_t mask[16] = {0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, + 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0}; + return IfThenElse(MaskFromVec(BitCast(d, Load(d8, mask))), b, a); +#else + return Vec128{_mm_blend_epi16(a.raw, b.raw, 0x55)}; +#endif +} + +template +HWY_INLINE Vec128 OddEven(const Vec128 a, const Vec128 b) { +#if HWY_TARGET == HWY_SSSE3 + const __m128i odd = _mm_shuffle_epi32(a.raw, _MM_SHUFFLE(3, 1, 3, 1)); + const __m128i even = _mm_shuffle_epi32(b.raw, _MM_SHUFFLE(2, 0, 2, 0)); + return Vec128{_mm_unpacklo_epi32(even, odd)}; +#else + // _mm_blend_epi16 has throughput 1/cycle on SKX, whereas _ps can do 3/cycle. + const DFromV d; + const RebindToFloat df; + return BitCast(d, Vec128{_mm_blend_ps(BitCast(df, a).raw, + BitCast(df, b).raw, 5)}); +#endif +} + +template +HWY_INLINE Vec128 OddEven(const Vec128 a, const Vec128 b) { + // Same as ConcatUpperLower for full vectors; do not call that because this + // is more efficient for 64x1 vectors. + const DFromV d; + const RebindToFloat dd; +#if HWY_TARGET == HWY_SSSE3 + return BitCast( + d, Vec128{_mm_shuffle_pd( + BitCast(dd, b).raw, BitCast(dd, a).raw, _MM_SHUFFLE2(1, 0))}); +#else + // _mm_shuffle_pd has throughput 1/cycle on SKX, whereas blend can do 3/cycle. 
+ return BitCast(d, Vec128{_mm_blend_pd(BitCast(dd, a).raw, + BitCast(dd, b).raw, 1)}); +#endif +} + +template +HWY_API Vec128 OddEven(Vec128 a, Vec128 b) { +#if HWY_TARGET == HWY_SSSE3 + // SHUFPS must fill the lower half of the output from one input, so we + // need another shuffle. Unpack avoids another immediate byte. + const __m128 odd = _mm_shuffle_ps(a.raw, a.raw, _MM_SHUFFLE(3, 1, 3, 1)); + const __m128 even = _mm_shuffle_ps(b.raw, b.raw, _MM_SHUFFLE(2, 0, 2, 0)); + return Vec128{_mm_unpacklo_ps(even, odd)}; +#else + return Vec128{_mm_blend_ps(a.raw, b.raw, 5)}; +#endif +} + +// ------------------------------ OddEvenBlocks +template +HWY_API Vec128 OddEvenBlocks(Vec128 /* odd */, Vec128 even) { + return even; +} + +// ------------------------------ SwapAdjacentBlocks + +template +HWY_API Vec128 SwapAdjacentBlocks(Vec128 v) { + return v; +} + +// ------------------------------ Shl (ZipLower, Mul) + +// Use AVX2/3 variable shifts where available, otherwise multiply by powers of +// two from loading float exponents, which is considerably faster (according +// to LLVM-MCA) than scalar or testing bits: https://gcc.godbolt.org/z/9G7Y9v. + +namespace detail { +#if HWY_TARGET > HWY_AVX3 // AVX2 or older + +// Returns 2^v for use as per-lane multipliers to emulate 16-bit shifts. +template +HWY_INLINE Vec128, N> Pow2(const Vec128 v) { + const DFromV d; + const RepartitionToWide dw; + const Rebind df; + const auto zero = Zero(d); + // Move into exponent (this u16 will become the upper half of an f32) + const auto exp = ShiftLeft<23 - 16>(v); + const auto upper = exp + Set(d, 0x3F80); // upper half of 1.0f + // Insert 0 into lower halves for reinterpreting as binary32. + const auto f0 = ZipLower(dw, zero, upper); + const auto f1 = ZipUpper(dw, zero, upper); + // See comment below. + const Vec128 bits0{_mm_cvtps_epi32(BitCast(df, f0).raw)}; + const Vec128 bits1{_mm_cvtps_epi32(BitCast(df, f1).raw)}; + return Vec128, N>{_mm_packus_epi32(bits0.raw, bits1.raw)}; +} + +// Same, for 32-bit shifts. +template +HWY_INLINE Vec128, N> Pow2(const Vec128 v) { + const DFromV d; + const auto exp = ShiftLeft<23>(v); + const auto f = exp + Set(d, 0x3F800000); // 1.0f + // Do not use ConvertTo because we rely on the native 0x80..00 overflow + // behavior. cvt instead of cvtt should be equivalent, but avoids test + // failure under GCC 10.2.1. 
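+  // (Worked example) For shift count 5: (5 << 23) + 0x3F800000 is a float
+  // with biased exponent 132 and zero mantissa, i.e. 32.0f, and cvtps_epi32
+  // returns 32 == 1 << 5, the desired per-lane multiplier.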
+ return Vec128, N>{_mm_cvtps_epi32(_mm_castsi128_ps(f.raw))}; +} + +#endif // HWY_TARGET > HWY_AVX3 + +template +HWY_API Vec128 Shl(hwy::UnsignedTag /*tag*/, Vec128 v, + Vec128 bits) { +#if HWY_TARGET <= HWY_AVX3 + return Vec128{_mm_sllv_epi16(v.raw, bits.raw)}; +#else + return v * Pow2(bits); +#endif +} +HWY_API Vec128 Shl(hwy::UnsignedTag /*tag*/, Vec128 v, + Vec128 bits) { + return Vec128{_mm_sll_epi16(v.raw, bits.raw)}; +} + +template +HWY_API Vec128 Shl(hwy::UnsignedTag /*tag*/, Vec128 v, + Vec128 bits) { +#if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4 + return v * Pow2(bits); +#else + return Vec128{_mm_sllv_epi32(v.raw, bits.raw)}; +#endif +} +HWY_API Vec128 Shl(hwy::UnsignedTag /*tag*/, Vec128 v, + const Vec128 bits) { + return Vec128{_mm_sll_epi32(v.raw, bits.raw)}; +} + +HWY_API Vec128 Shl(hwy::UnsignedTag /*tag*/, Vec128 v, + Vec128 bits) { +#if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4 + // Individual shifts and combine + const Vec128 out0{_mm_sll_epi64(v.raw, bits.raw)}; + const __m128i bits1 = _mm_unpackhi_epi64(bits.raw, bits.raw); + const Vec128 out1{_mm_sll_epi64(v.raw, bits1)}; + return ConcatUpperLower(Full128(), out1, out0); +#else + return Vec128{_mm_sllv_epi64(v.raw, bits.raw)}; +#endif +} +HWY_API Vec64 Shl(hwy::UnsignedTag /*tag*/, Vec64 v, + Vec64 bits) { + return Vec64{_mm_sll_epi64(v.raw, bits.raw)}; +} + +// Signed left shift is the same as unsigned. +template +HWY_API Vec128 Shl(hwy::SignedTag /*tag*/, Vec128 v, + Vec128 bits) { + const DFromV di; + const RebindToUnsigned du; + return BitCast(di, + Shl(hwy::UnsignedTag(), BitCast(du, v), BitCast(du, bits))); +} + +} // namespace detail + +template +HWY_API Vec128 operator<<(Vec128 v, Vec128 bits) { + return detail::Shl(hwy::TypeTag(), v, bits); +} + +// ------------------------------ Shr (mul, mask, BroadcastSignBit) + +// Use AVX2+ variable shifts except for SSSE3/SSE4 or 16-bit. There, we use +// widening multiplication by powers of two obtained by loading float exponents, +// followed by a constant right-shift. This is still faster than a scalar or +// bit-test approach: https://gcc.godbolt.org/z/9G7Y9v. + +template +HWY_API Vec128 operator>>(const Vec128 in, + const Vec128 bits) { +#if HWY_TARGET <= HWY_AVX3 + return Vec128{_mm_srlv_epi16(in.raw, bits.raw)}; +#else + const Simd d; + // For bits=0, we cannot mul by 2^16, so fix the result later. + const auto out = MulHigh(in, detail::Pow2(Set(d, 16) - bits)); + // Replace output with input where bits == 0. + return IfThenElse(bits == Zero(d), in, out); +#endif +} +HWY_API Vec128 operator>>(const Vec128 in, + const Vec128 bits) { + return Vec128{_mm_srl_epi16(in.raw, bits.raw)}; +} + +template +HWY_API Vec128 operator>>(const Vec128 in, + const Vec128 bits) { +#if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4 + // 32x32 -> 64 bit mul, then shift right by 32. + const Simd d32; + // Move odd lanes into position for the second mul. Shuffle more gracefully + // handles N=1 than repartitioning to u64 and shifting 32 bits right. + const Vec128 in31{_mm_shuffle_epi32(in.raw, 0x31)}; + // For bits=0, we cannot mul by 2^32, so fix the result later. + const auto mul = detail::Pow2(Set(d32, 32) - bits); + const auto out20 = ShiftRight<32>(MulEven(in, mul)); // z 2 z 0 + const Vec128 mul31{_mm_shuffle_epi32(mul.raw, 0x31)}; + // No need to shift right, already in the correct position. + const auto out31 = BitCast(d32, MulEven(in31, mul31)); // 3 ? 1 ? 
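+  // (Illustration) Per lane, in >> s equals the upper 32 bits of the 64-bit
+  // product in * 2^(32 - s). MulEven provides that product for the even
+  // lanes; the 0x31 shuffles feed the odd lanes, and OddEven below recombines.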
+ const Vec128 out = OddEven(out31, BitCast(d32, out20)); + // Replace output with input where bits == 0. + return IfThenElse(bits == Zero(d32), in, out); +#else + return Vec128{_mm_srlv_epi32(in.raw, bits.raw)}; +#endif +} +HWY_API Vec128 operator>>(const Vec128 in, + const Vec128 bits) { + return Vec128{_mm_srl_epi32(in.raw, bits.raw)}; +} + +HWY_API Vec128 operator>>(const Vec128 v, + const Vec128 bits) { +#if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4 + // Individual shifts and combine + const Vec128 out0{_mm_srl_epi64(v.raw, bits.raw)}; + const __m128i bits1 = _mm_unpackhi_epi64(bits.raw, bits.raw); + const Vec128 out1{_mm_srl_epi64(v.raw, bits1)}; + return ConcatUpperLower(Full128(), out1, out0); +#else + return Vec128{_mm_srlv_epi64(v.raw, bits.raw)}; +#endif +} +HWY_API Vec64 operator>>(const Vec64 v, + const Vec64 bits) { + return Vec64{_mm_srl_epi64(v.raw, bits.raw)}; +} + +#if HWY_TARGET > HWY_AVX3 // AVX2 or older +namespace detail { + +// Also used in x86_256-inl.h. +template +HWY_INLINE V SignedShr(const DI di, const V v, const V count_i) { + const RebindToUnsigned du; + const auto count = BitCast(du, count_i); // same type as value to shift + // Clear sign and restore afterwards. This is preferable to shifting the MSB + // downwards because Shr is somewhat more expensive than Shl. + const auto sign = BroadcastSignBit(v); + const auto abs = BitCast(du, v ^ sign); // off by one, but fixed below + return BitCast(di, abs >> count) ^ sign; +} + +} // namespace detail +#endif // HWY_TARGET > HWY_AVX3 + +template +HWY_API Vec128 operator>>(const Vec128 v, + const Vec128 bits) { +#if HWY_TARGET <= HWY_AVX3 + return Vec128{_mm_srav_epi16(v.raw, bits.raw)}; +#else + return detail::SignedShr(Simd(), v, bits); +#endif +} +HWY_API Vec128 operator>>(const Vec128 v, + const Vec128 bits) { + return Vec128{_mm_sra_epi16(v.raw, bits.raw)}; +} + +template +HWY_API Vec128 operator>>(const Vec128 v, + const Vec128 bits) { +#if HWY_TARGET <= HWY_AVX3 + return Vec128{_mm_srav_epi32(v.raw, bits.raw)}; +#else + return detail::SignedShr(Simd(), v, bits); +#endif +} +HWY_API Vec128 operator>>(const Vec128 v, + const Vec128 bits) { + return Vec128{_mm_sra_epi32(v.raw, bits.raw)}; +} + +template +HWY_API Vec128 operator>>(const Vec128 v, + const Vec128 bits) { +#if HWY_TARGET <= HWY_AVX3 + return Vec128{_mm_srav_epi64(v.raw, bits.raw)}; +#else + return detail::SignedShr(Simd(), v, bits); +#endif +} + +// ------------------------------ MulEven/Odd 64x64 (UpperHalf) + +HWY_INLINE Vec128 MulEven(const Vec128 a, + const Vec128 b) { + alignas(16) uint64_t mul[2]; + mul[0] = Mul128(GetLane(a), GetLane(b), &mul[1]); + return Load(Full128(), mul); +} + +HWY_INLINE Vec128 MulOdd(const Vec128 a, + const Vec128 b) { + alignas(16) uint64_t mul[2]; + const Half> d2; + mul[0] = + Mul128(GetLane(UpperHalf(d2, a)), GetLane(UpperHalf(d2, b)), &mul[1]); + return Load(Full128(), mul); +} + +// ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower) + +template +HWY_API Vec128 ReorderWidenMulAccumulate(Simd df32, + Vec128 a, + Vec128 b, + const Vec128 sum0, + Vec128& sum1) { + // TODO(janwas): _mm_dpbf16_ps when available + const Repartition du16; + const RebindToUnsigned du32; + const Vec128 zero = Zero(du16); + // Lane order within sum0/1 is undefined, hence we can avoid the + // longer-latency lane-crossing PromoteTo. 
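+  // (Illustration) A bf16 value is exactly the upper 16 bits of its f32
+  // encoding, so zipping zeros into the lower halves reconstructs each f32
+  // bit-for-bit, merely in interleaved (zip) lane order.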
+ const Vec128 a0 = ZipLower(du32, zero, BitCast(du16, a)); + const Vec128 a1 = ZipUpper(du32, zero, BitCast(du16, a)); + const Vec128 b0 = ZipLower(du32, zero, BitCast(du16, b)); + const Vec128 b1 = ZipUpper(du32, zero, BitCast(du16, b)); + sum1 = MulAdd(BitCast(df32, a1), BitCast(df32, b1), sum1); + return MulAdd(BitCast(df32, a0), BitCast(df32, b0), sum0); +} + +// Even if N=1, the input is always at least 2 lanes, hence madd_epi16 is safe. +template +HWY_API Vec128 ReorderWidenMulAccumulate( + Simd /*d32*/, Vec128 a, + Vec128 b, const Vec128 sum0, + Vec128& /*sum1*/) { + return sum0 + Vec128{_mm_madd_epi16(a.raw, b.raw)}; +} + +// ================================================== CONVERT + +// ------------------------------ Promotions (part w/ narrow lanes -> full) + +// Unsigned: zero-extend. +template +HWY_API Vec128 PromoteTo(Simd /* tag */, + const Vec128 v) { +#if HWY_TARGET == HWY_SSSE3 + const __m128i zero = _mm_setzero_si128(); + return Vec128{_mm_unpacklo_epi8(v.raw, zero)}; +#else + return Vec128{_mm_cvtepu8_epi16(v.raw)}; +#endif +} +template +HWY_API Vec128 PromoteTo(Simd /* tag */, + const Vec128 v) { +#if HWY_TARGET == HWY_SSSE3 + return Vec128{_mm_unpacklo_epi16(v.raw, _mm_setzero_si128())}; +#else + return Vec128{_mm_cvtepu16_epi32(v.raw)}; +#endif +} +template +HWY_API Vec128 PromoteTo(Simd /* tag */, + const Vec128 v) { +#if HWY_TARGET == HWY_SSSE3 + return Vec128{_mm_unpacklo_epi32(v.raw, _mm_setzero_si128())}; +#else + return Vec128{_mm_cvtepu32_epi64(v.raw)}; +#endif +} +template +HWY_API Vec128 PromoteTo(Simd /* tag */, + const Vec128 v) { +#if HWY_TARGET == HWY_SSSE3 + const __m128i zero = _mm_setzero_si128(); + const __m128i u16 = _mm_unpacklo_epi8(v.raw, zero); + return Vec128{_mm_unpacklo_epi16(u16, zero)}; +#else + return Vec128{_mm_cvtepu8_epi32(v.raw)}; +#endif +} + +// Unsigned to signed: same plus cast. +template +HWY_API Vec128 PromoteTo(Simd di, + const Vec128 v) { + return BitCast(di, PromoteTo(Simd(), v)); +} +template +HWY_API Vec128 PromoteTo(Simd di, + const Vec128 v) { + return BitCast(di, PromoteTo(Simd(), v)); +} +template +HWY_API Vec128 PromoteTo(Simd di, + const Vec128 v) { + return BitCast(di, PromoteTo(Simd(), v)); +} + +// Signed: replicate sign bit. 
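+// (Illustration) On SSSE3, _mm_unpacklo_epi8(v, v) duplicates each byte into
+// both halves of a 16-bit lane; the arithmetic ShiftRight<8> then overwrites
+// the duplicated high byte with copies of the sign bit, completing the sign
+// extension.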
+template +HWY_API Vec128 PromoteTo(Simd /* tag */, + const Vec128 v) { +#if HWY_TARGET == HWY_SSSE3 + return ShiftRight<8>(Vec128{_mm_unpacklo_epi8(v.raw, v.raw)}); +#else + return Vec128{_mm_cvtepi8_epi16(v.raw)}; +#endif +} +template +HWY_API Vec128 PromoteTo(Simd /* tag */, + const Vec128 v) { +#if HWY_TARGET == HWY_SSSE3 + return ShiftRight<16>(Vec128{_mm_unpacklo_epi16(v.raw, v.raw)}); +#else + return Vec128{_mm_cvtepi16_epi32(v.raw)}; +#endif +} +template +HWY_API Vec128 PromoteTo(Simd /* tag */, + const Vec128 v) { +#if HWY_TARGET == HWY_SSSE3 + return ShiftRight<32>(Vec128{_mm_unpacklo_epi32(v.raw, v.raw)}); +#else + return Vec128{_mm_cvtepi32_epi64(v.raw)}; +#endif +} +template +HWY_API Vec128 PromoteTo(Simd /* tag */, + const Vec128 v) { +#if HWY_TARGET == HWY_SSSE3 + const __m128i x2 = _mm_unpacklo_epi8(v.raw, v.raw); + const __m128i x4 = _mm_unpacklo_epi16(x2, x2); + return ShiftRight<24>(Vec128{x4}); +#else + return Vec128{_mm_cvtepi8_epi32(v.raw)}; +#endif +} + +// Workaround for origin tracking bug in Clang msan prior to 11.0 +// (spurious "uninitialized memory" for TestF16 with "ORIGIN: invalid") +#if HWY_IS_MSAN && (HWY_COMPILER_CLANG != 0 && HWY_COMPILER_CLANG < 1100) +#define HWY_INLINE_F16 HWY_NOINLINE +#else +#define HWY_INLINE_F16 HWY_INLINE +#endif +template +HWY_INLINE_F16 Vec128 PromoteTo(Simd df32, + const Vec128 v) { +#if HWY_TARGET >= HWY_SSE4 || defined(HWY_DISABLE_F16C) + const RebindToSigned di32; + const RebindToUnsigned du32; + // Expand to u32 so we can shift. + const auto bits16 = PromoteTo(du32, Vec128{v.raw}); + const auto sign = ShiftRight<15>(bits16); + const auto biased_exp = ShiftRight<10>(bits16) & Set(du32, 0x1F); + const auto mantissa = bits16 & Set(du32, 0x3FF); + const auto subnormal = + BitCast(du32, ConvertTo(df32, BitCast(di32, mantissa)) * + Set(df32, 1.0f / 16384 / 1024)); + + const auto biased_exp32 = biased_exp + Set(du32, 127 - 15); + const auto mantissa32 = ShiftLeft<23 - 10>(mantissa); + const auto normal = ShiftLeft<23>(biased_exp32) | mantissa32; + const auto bits32 = IfThenElse(biased_exp == Zero(du32), subnormal, normal); + return BitCast(df32, ShiftLeft<31>(sign) | bits32); +#else + (void)df32; + return Vec128{_mm_cvtph_ps(v.raw)}; +#endif +} + +template +HWY_API Vec128 PromoteTo(Simd df32, + const Vec128 v) { + const Rebind du16; + const RebindToSigned di32; + return BitCast(df32, ShiftLeft<16>(PromoteTo(di32, BitCast(du16, v)))); +} + +template +HWY_API Vec128 PromoteTo(Simd /* tag */, + const Vec128 v) { + return Vec128{_mm_cvtps_pd(v.raw)}; +} + +template +HWY_API Vec128 PromoteTo(Simd /* tag */, + const Vec128 v) { + return Vec128{_mm_cvtepi32_pd(v.raw)}; +} + +// ------------------------------ Demotions (full -> part w/ narrow lanes) + +template +HWY_API Vec128 DemoteTo(Simd /* tag */, + const Vec128 v) { +#if HWY_TARGET == HWY_SSSE3 + const Simd di32; + const Simd du16; + const auto zero_if_neg = AndNot(ShiftRight<31>(v), v); + const auto too_big = VecFromMask(di32, Gt(v, Set(di32, 0xFFFF))); + const auto clamped = Or(zero_if_neg, too_big); + // Lower 2 bytes from each 32-bit lane; same as return type for fewer casts. 
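+  // (Illustration) In pshufb, an index byte with its MSB set (here 0x80)
+  // zeroes the corresponding output byte, so the 0x8080 entries clear the
+  // unused upper half of the result.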
+ alignas(16) constexpr uint16_t kLower2Bytes[16] = { + 0x0100, 0x0504, 0x0908, 0x0D0C, 0x8080, 0x8080, 0x8080, 0x8080}; + const auto lo2 = Load(du16, kLower2Bytes); + return Vec128{TableLookupBytes(BitCast(du16, clamped), lo2).raw}; +#else + return Vec128{_mm_packus_epi32(v.raw, v.raw)}; +#endif +} + +template +HWY_API Vec128 DemoteTo(Simd /* tag */, + const Vec128 v) { + return Vec128{_mm_packs_epi32(v.raw, v.raw)}; +} + +template +HWY_API Vec128 DemoteTo(Simd /* tag */, + const Vec128 v) { + const __m128i i16 = _mm_packs_epi32(v.raw, v.raw); + return Vec128{_mm_packus_epi16(i16, i16)}; +} + +template +HWY_API Vec128 DemoteTo(Simd /* tag */, + const Vec128 v) { + return Vec128{_mm_packus_epi16(v.raw, v.raw)}; +} + +template +HWY_API Vec128 DemoteTo(Simd /* tag */, + const Vec128 v) { + const __m128i i16 = _mm_packs_epi32(v.raw, v.raw); + return Vec128{_mm_packs_epi16(i16, i16)}; +} + +template +HWY_API Vec128 DemoteTo(Simd /* tag */, + const Vec128 v) { + return Vec128{_mm_packs_epi16(v.raw, v.raw)}; +} + +// Work around MSVC warning for _mm_cvtps_ph (8 is actually a valid immediate). +// clang-cl requires a non-empty string, so we 'ignore' the irrelevant -Wmain. +HWY_DIAGNOSTICS(push) +HWY_DIAGNOSTICS_OFF(disable : 4556, ignored "-Wmain") + +template +HWY_API Vec128 DemoteTo(Simd df16, + const Vec128 v) { +#if HWY_TARGET >= HWY_SSE4 || defined(HWY_DISABLE_F16C) + const RebindToUnsigned du16; + const Rebind du; + const RebindToSigned di; + const auto bits32 = BitCast(du, v); + const auto sign = ShiftRight<31>(bits32); + const auto biased_exp32 = ShiftRight<23>(bits32) & Set(du, 0xFF); + const auto mantissa32 = bits32 & Set(du, 0x7FFFFF); + + const auto k15 = Set(di, 15); + const auto exp = Min(BitCast(di, biased_exp32) - Set(di, 127), k15); + const auto is_tiny = exp < Set(di, -24); + + const auto is_subnormal = exp < Set(di, -14); + const auto biased_exp16 = + BitCast(du, IfThenZeroElse(is_subnormal, exp + k15)); + const auto sub_exp = BitCast(du, Set(di, -14) - exp); // [1, 11) + const auto sub_m = (Set(du, 1) << (Set(du, 10) - sub_exp)) + + (mantissa32 >> (Set(du, 13) + sub_exp)); + const auto mantissa16 = IfThenElse(RebindMask(du, is_subnormal), sub_m, + ShiftRight<13>(mantissa32)); // <1024 + + const auto sign16 = ShiftLeft<15>(sign); + const auto normal16 = sign16 | ShiftLeft<10>(biased_exp16) | mantissa16; + const auto bits16 = IfThenZeroElse(is_tiny, BitCast(di, normal16)); + return BitCast(df16, DemoteTo(du16, bits16)); +#else + (void)df16; + return Vec128{_mm_cvtps_ph(v.raw, _MM_FROUND_NO_EXC)}; +#endif +} + +HWY_DIAGNOSTICS(pop) + +template +HWY_API Vec128 DemoteTo(Simd dbf16, + const Vec128 v) { + // TODO(janwas): _mm_cvtneps_pbh once we have avx512bf16. + const Rebind di32; + const Rebind du32; // for logical shift right + const Rebind du16; + const auto bits_in_32 = BitCast(di32, ShiftRight<16>(BitCast(du32, v))); + return BitCast(dbf16, DemoteTo(du16, bits_in_32)); +} + +template +HWY_API Vec128 ReorderDemote2To( + Simd dbf16, Vec128 a, Vec128 b) { + // TODO(janwas): _mm_cvtne2ps_pbh once we have avx512bf16. + const RebindToUnsigned du16; + const Repartition du32; + const Vec128 b_in_even = ShiftRight<16>(BitCast(du32, b)); + return BitCast(dbf16, OddEven(BitCast(du16, a), BitCast(du16, b_in_even))); +} + +// Specializations for partial vectors because packs_epi32 sets lanes above 2*N. +HWY_API Vec128 ReorderDemote2To(Simd dn, + Vec128 a, + Vec128 b) { + const Half dnh; + // Pretend the result has twice as many lanes so we can InterleaveLower. 
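+  // (Note) The resulting a0,b0,a1,b1 interleaving appears acceptable because
+  // the ReorderDemote2To contract, like ReorderWidenMulAccumulate, leaves
+  // lane order unspecified.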
+ const Vec128 an{DemoteTo(dnh, a).raw}; + const Vec128 bn{DemoteTo(dnh, b).raw}; + return InterleaveLower(an, bn); +} +HWY_API Vec128 ReorderDemote2To(Simd dn, + Vec128 a, + Vec128 b) { + const Half dnh; + // Pretend the result has twice as many lanes so we can InterleaveLower. + const Vec128 an{DemoteTo(dnh, a).raw}; + const Vec128 bn{DemoteTo(dnh, b).raw}; + return InterleaveLower(an, bn); +} +HWY_API Vec128 ReorderDemote2To(Full128 /*d16*/, + Vec128 a, Vec128 b) { + return Vec128{_mm_packs_epi32(a.raw, b.raw)}; +} + +template +HWY_API Vec128 DemoteTo(Simd /* tag */, + const Vec128 v) { + return Vec128{_mm_cvtpd_ps(v.raw)}; +} + +namespace detail { + +// For well-defined float->int demotion in all x86_*-inl.h. + +template +HWY_INLINE auto ClampF64ToI32Max(Simd d, decltype(Zero(d)) v) + -> decltype(Zero(d)) { + // The max can be exactly represented in binary64, so clamping beforehand + // prevents x86 conversion from raising an exception and returning 80..00. + return Min(v, Set(d, 2147483647.0)); +} + +// For ConvertTo float->int of same size, clamping before conversion would +// change the result because the max integer value is not exactly representable. +// Instead detect the overflow result after conversion and fix it. +template > +HWY_INLINE auto FixConversionOverflow(DI di, VFromD original, + decltype(Zero(di).raw) converted_raw) + -> VFromD { + // Combinations of original and output sign: + // --: normal <0 or -huge_val to 80..00: OK + // -+: -0 to 0 : OK + // +-: +huge_val to 80..00 : xor with FF..FF to get 7F..FF + // ++: normal >0 : OK + const auto converted = VFromD{converted_raw}; + const auto sign_wrong = AndNot(BitCast(di, original), converted); +#if HWY_COMPILER_GCC_ACTUAL + // Critical GCC 11 compiler bug (possibly also GCC 10): omits the Xor; also + // Add() if using that instead. Work around with one more instruction. + const RebindToUnsigned du; + const VFromD mask = BroadcastSignBit(sign_wrong); + const VFromD max = BitCast(di, ShiftRight<1>(BitCast(du, mask))); + return IfVecThenElse(mask, max, converted); +#else + return Xor(converted, BroadcastSignBit(sign_wrong)); +#endif +} + +} // namespace detail + +template +HWY_API Vec128 DemoteTo(Simd /* tag */, + const Vec128 v) { + const auto clamped = detail::ClampF64ToI32Max(Simd(), v); + return Vec128{_mm_cvttpd_epi32(clamped.raw)}; +} + +// For already range-limited input [0, 255]. +template +HWY_API Vec128 U8FromU32(const Vec128 v) { + const Simd d32; + const Simd d8; + alignas(16) static constexpr uint32_t k8From32[4] = { + 0x0C080400u, 0x0C080400u, 0x0C080400u, 0x0C080400u}; + // Also replicate bytes into all 32 bit lanes for safety. 
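+  // (Illustration) 0x0C080400 selects bytes 0, 4, 8 and 12, the low byte of
+  // each 32-bit lane; repeating it in all four table lanes writes the same
+  // packed quad into every 32-bit chunk of the result.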
+ const auto quad = TableLookupBytes(v, Load(d32, k8From32)); + return LowerHalf(LowerHalf(BitCast(d8, quad))); +} + +// ------------------------------ Truncations + +template * = nullptr> +HWY_API Vec128 TruncateTo(Simd /* tag */, + const Vec128 v) { + static_assert(!IsSigned() && !IsSigned(), "Unsigned only"); + const Repartition> d; + const auto v1 = BitCast(d, v); + return Vec128{v1.raw}; +} + +HWY_API Vec128 TruncateTo(Simd /* tag */, + const Vec128 v) { + const Full128 d8; + alignas(16) static constexpr uint8_t kMap[16] = {0, 8, 0, 8, 0, 8, 0, 8, + 0, 8, 0, 8, 0, 8, 0, 8}; + return LowerHalf(LowerHalf(LowerHalf(TableLookupBytes(v, Load(d8, kMap))))); +} + +HWY_API Vec128 TruncateTo(Simd /* tag */, + const Vec128 v) { + const Full128 d16; + alignas(16) static constexpr uint16_t kMap[8] = { + 0x100u, 0x908u, 0x100u, 0x908u, 0x100u, 0x908u, 0x100u, 0x908u}; + return LowerHalf(LowerHalf(TableLookupBytes(v, Load(d16, kMap)))); +} + +HWY_API Vec128 TruncateTo(Simd /* tag */, + const Vec128 v) { + return Vec128{_mm_shuffle_epi32(v.raw, 0x88)}; +} + +template = 2>* = nullptr> +HWY_API Vec128 TruncateTo(Simd /* tag */, + const Vec128 v) { + const Repartition> d; + alignas(16) static constexpr uint8_t kMap[16] = { + 0x0u, 0x4u, 0x8u, 0xCu, 0x0u, 0x4u, 0x8u, 0xCu, + 0x0u, 0x4u, 0x8u, 0xCu, 0x0u, 0x4u, 0x8u, 0xCu}; + return LowerHalf(LowerHalf(TableLookupBytes(v, Load(d, kMap)))); +} + +template = 2>* = nullptr> +HWY_API Vec128 TruncateTo(Simd /* tag */, + const Vec128 v) { + const Repartition> d; + const auto v1 = BitCast(d, v); + return LowerHalf(ConcatEven(d, v1, v1)); +} + +template = 2>* = nullptr> +HWY_API Vec128 TruncateTo(Simd /* tag */, + const Vec128 v) { + const Repartition> d; + const auto v1 = BitCast(d, v); + return LowerHalf(ConcatEven(d, v1, v1)); +} + +// ------------------------------ Integer <=> fp (ShiftRight, OddEven) + +template +HWY_API Vec128 ConvertTo(Simd /* tag */, + const Vec128 v) { + return Vec128{_mm_cvtepi32_ps(v.raw)}; +} + +template +HWY_API Vec128 ConvertTo(HWY_MAYBE_UNUSED Simd df, + const Vec128 v) { +#if HWY_TARGET <= HWY_AVX3 + return Vec128{_mm_cvtepu32_ps(v.raw)}; +#else + // Based on wim's approach (https://stackoverflow.com/questions/34066228/) + const RebindToUnsigned du32; + const RebindToSigned d32; + + const auto msk_lo = Set(du32, 0xFFFF); + const auto cnst2_16_flt = Set(df, 65536.0f); // 2^16 + + // Extract the 16 lowest/highest significant bits of v and cast to signed int + const auto v_lo = BitCast(d32, And(v, msk_lo)); + const auto v_hi = BitCast(d32, ShiftRight<16>(v)); + return MulAdd(cnst2_16_flt, ConvertTo(df, v_hi), ConvertTo(df, v_lo)); +#endif +} + +template +HWY_API Vec128 ConvertTo(Simd dd, + const Vec128 v) { +#if HWY_TARGET <= HWY_AVX3 + (void)dd; + return Vec128{_mm_cvtepi64_pd(v.raw)}; +#else + // Based on wim's approach (https://stackoverflow.com/questions/41144668/) + const Repartition d32; + const Repartition d64; + + // Toggle MSB of lower 32-bits and insert exponent for 2^84 + 2^63 + const auto k84_63 = Set(d64, 0x4530000080000000ULL); + const auto v_upper = BitCast(dd, ShiftRight<32>(BitCast(d64, v)) ^ k84_63); + + // Exponent is 2^52, lower 32 bits from v (=> 32-bit OddEven) + const auto k52 = Set(d32, 0x43300000); + const auto v_lower = BitCast(dd, OddEven(k52, BitCast(d32, v))); + + const auto k84_63_52 = BitCast(dd, Set(d64, 0x4530000080100000ULL)); + return (v_upper - k84_63_52) + v_lower; // order matters! 
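+  // (Sketch of the bias arithmetic) v_upper is the high 32 bits scaled by
+  // 2^32 plus the constants 2^84 and 2^63 (the xor pre-flips the sign bit so
+  // negative inputs round-trip); v_lower is the low 32 bits plus 2^52.
+  // Subtracting k84_63_52 = 2^84 + 2^63 + 2^52 removes all three biases.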
+#endif +} + +template +HWY_API Vec128 ConvertTo(HWY_MAYBE_UNUSED Simd dd, + const Vec128 v) { +#if HWY_TARGET <= HWY_AVX3 + return Vec128{_mm_cvtepu64_pd(v.raw)}; +#else + // Based on wim's approach (https://stackoverflow.com/questions/41144668/) + const RebindToUnsigned d64; + using VU = VFromD; + + const VU msk_lo = Set(d64, 0xFFFFFFFF); + const auto cnst2_32_dbl = Set(dd, 4294967296.0); // 2^32 + + // Extract the 32 lowest/highest significant bits of v + const VU v_lo = And(v, msk_lo); + const VU v_hi = ShiftRight<32>(v); + + auto uint64_to_double128_fast = [&dd](VU w) HWY_ATTR { + w = Or(w, VU{detail::BitCastToInteger(Set(dd, 0x0010000000000000).raw)}); + return BitCast(dd, w) - Set(dd, 0x0010000000000000); + }; + + const auto v_lo_dbl = uint64_to_double128_fast(v_lo); + return MulAdd(cnst2_32_dbl, uint64_to_double128_fast(v_hi), v_lo_dbl); +#endif +} + +// Truncates (rounds toward zero). +template +HWY_API Vec128 ConvertTo(const Simd di, + const Vec128 v) { + return detail::FixConversionOverflow(di, v, _mm_cvttps_epi32(v.raw)); +} + +// Full (partial handled below) +HWY_API Vec128 ConvertTo(Full128 di, const Vec128 v) { +#if HWY_TARGET <= HWY_AVX3 && HWY_ARCH_X86_64 + return detail::FixConversionOverflow(di, v, _mm_cvttpd_epi64(v.raw)); +#elif HWY_ARCH_X86_64 + const __m128i i0 = _mm_cvtsi64_si128(_mm_cvttsd_si64(v.raw)); + const Half> dd2; + const __m128i i1 = _mm_cvtsi64_si128(_mm_cvttsd_si64(UpperHalf(dd2, v).raw)); + return detail::FixConversionOverflow(di, v, _mm_unpacklo_epi64(i0, i1)); +#else + using VI = VFromD; + const VI k0 = Zero(di); + const VI k1 = Set(di, 1); + const VI k51 = Set(di, 51); + + // Exponent indicates whether the number can be represented as int64_t. + const VI biased_exp = ShiftRight<52>(BitCast(di, v)) & Set(di, 0x7FF); + const VI exp = biased_exp - Set(di, 0x3FF); + const auto in_range = exp < Set(di, 63); + + // If we were to cap the exponent at 51 and add 2^52, the number would be in + // [2^52, 2^53) and mantissa bits could be read out directly. We need to + // round-to-0 (truncate), but changing rounding mode in MXCSR hits a + // compiler reordering bug: https://gcc.godbolt.org/z/4hKj6c6qc . We instead + // manually shift the mantissa into place (we already have many of the + // inputs anyway). + const VI shift_mnt = Max(k51 - exp, k0); + const VI shift_int = Max(exp - k51, k0); + const VI mantissa = BitCast(di, v) & Set(di, (1ULL << 52) - 1); + // Include implicit 1-bit; shift by one more to ensure it's in the mantissa. + const VI int52 = (mantissa | Set(di, 1ULL << 52)) >> (shift_mnt + k1); + // For inputs larger than 2^52, insert zeros at the bottom. + const VI shifted = int52 << shift_int; + // Restore the one bit lost when shifting in the implicit 1-bit. + const VI restored = shifted | ((mantissa & k1) << (shift_int - k1)); + + // Saturate to LimitsMin (unchanged when negating below) or LimitsMax. + const VI sign_mask = BroadcastSignBit(BitCast(di, v)); + const VI limit = Set(di, LimitsMax()) - sign_mask; + const VI magnitude = IfThenElse(in_range, restored, limit); + + // If the input was negative, negate the integer (two's complement). 
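+  // (x ^ m) - m is branchless negation: m = FF..FF yields ~x + 1 = -x, while
+  // m = 0 leaves x unchanged; LimitsMin is its own negation, as noted above.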
+ return (magnitude ^ sign_mask) - sign_mask; +#endif +} +HWY_API Vec64 ConvertTo(Full64 di, const Vec64 v) { + // Only need to specialize for non-AVX3, 64-bit (single scalar op) +#if HWY_TARGET > HWY_AVX3 && HWY_ARCH_X86_64 + const Vec64 i0{_mm_cvtsi64_si128(_mm_cvttsd_si64(v.raw))}; + return detail::FixConversionOverflow(di, v, i0.raw); +#else + (void)di; + const auto full = ConvertTo(Full128(), Vec128{v.raw}); + return Vec64{full.raw}; +#endif +} + +template +HWY_API Vec128 NearestInt(const Vec128 v) { + const Simd di; + return detail::FixConversionOverflow(di, v, _mm_cvtps_epi32(v.raw)); +} + +// ------------------------------ Floating-point rounding (ConvertTo) + +#if HWY_TARGET == HWY_SSSE3 + +// Toward nearest integer, ties to even +template +HWY_API Vec128 Round(const Vec128 v) { + static_assert(IsFloat(), "Only for float"); + // Rely on rounding after addition with a large value such that no mantissa + // bits remain (assuming the current mode is nearest-even). We may need a + // compiler flag for precise floating-point to prevent "optimizing" this out. + const Simd df; + const auto max = Set(df, MantissaEnd()); + const auto large = CopySignToAbs(max, v); + const auto added = large + v; + const auto rounded = added - large; + // Keep original if NaN or the magnitude is large (already an int). + return IfThenElse(Abs(v) < max, rounded, v); +} + +namespace detail { + +// Truncating to integer and converting back to float is correct except when the +// input magnitude is large, in which case the input was already an integer +// (because mantissa >> exponent is zero). +template +HWY_INLINE Mask128 UseInt(const Vec128 v) { + static_assert(IsFloat(), "Only for float"); + return Abs(v) < Set(Simd(), MantissaEnd()); +} + +} // namespace detail + +// Toward zero, aka truncate +template +HWY_API Vec128 Trunc(const Vec128 v) { + static_assert(IsFloat(), "Only for float"); + const Simd df; + const RebindToSigned di; + + const auto integer = ConvertTo(di, v); // round toward 0 + const auto int_f = ConvertTo(df, integer); + + return IfThenElse(detail::UseInt(v), CopySign(int_f, v), v); +} + +// Toward +infinity, aka ceiling +template +HWY_API Vec128 Ceil(const Vec128 v) { + static_assert(IsFloat(), "Only for float"); + const Simd df; + const RebindToSigned di; + + const auto integer = ConvertTo(di, v); // round toward 0 + const auto int_f = ConvertTo(df, integer); + + // Truncating a positive non-integer ends up smaller; if so, add 1. + const auto neg1 = ConvertTo(df, VecFromMask(di, RebindMask(di, int_f < v))); + + return IfThenElse(detail::UseInt(v), int_f - neg1, v); +} + +// Toward -infinity, aka floor +template +HWY_API Vec128 Floor(const Vec128 v) { + static_assert(IsFloat(), "Only for float"); + const Simd df; + const RebindToSigned di; + + const auto integer = ConvertTo(di, v); // round toward 0 + const auto int_f = ConvertTo(df, integer); + + // Truncating a negative non-integer ends up larger; if so, subtract 1. 
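+  // The all-ones comparison mask converts to -1.0, so adding it subtracts 1:
+  // e.g. Floor(-1.5): trunc gives -1.0 > -1.5, hence -1.0 + -1.0 = -2.0.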
+  const auto neg1 = ConvertTo(df, VecFromMask(di, RebindMask(di, int_f > v)));
+
+  return IfThenElse(detail::UseInt(v), int_f + neg1, v);
+}
+
+#else
+
+// Toward nearest integer, ties to even
+template
+HWY_API Vec128 Round(const Vec128 v) {
+  return Vec128{
+      _mm_round_ps(v.raw, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)};
+}
+template
+HWY_API Vec128 Round(const Vec128 v) {
+  return Vec128{
+      _mm_round_pd(v.raw, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)};
+}
+
+// Toward zero, aka truncate
+template
+HWY_API Vec128 Trunc(const Vec128 v) {
+  return Vec128{
+      _mm_round_ps(v.raw, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)};
+}
+template
+HWY_API Vec128 Trunc(const Vec128 v) {
+  return Vec128{
+      _mm_round_pd(v.raw, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)};
+}
+
+// Toward +infinity, aka ceiling
+template
+HWY_API Vec128 Ceil(const Vec128 v) {
+  return Vec128{
+      _mm_round_ps(v.raw, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)};
+}
+template
+HWY_API Vec128 Ceil(const Vec128 v) {
+  return Vec128{
+      _mm_round_pd(v.raw, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)};
+}
+
+// Toward -infinity, aka floor
+template
+HWY_API Vec128 Floor(const Vec128 v) {
+  return Vec128{
+      _mm_round_ps(v.raw, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)};
+}
+template
+HWY_API Vec128 Floor(const Vec128 v) {
+  return Vec128{
+      _mm_round_pd(v.raw, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)};
+}
+
+#endif  // !HWY_SSSE3
+
+// ------------------------------ Floating-point classification
+
+template
+HWY_API Mask128 IsNaN(const Vec128 v) {
+#if HWY_TARGET <= HWY_AVX3
+  return Mask128{_mm_fpclass_ps_mask(v.raw, 0x81)};
+#else
+  return Mask128{_mm_cmpunord_ps(v.raw, v.raw)};
+#endif
+}
+template
+HWY_API Mask128 IsNaN(const Vec128 v) {
+#if HWY_TARGET <= HWY_AVX3
+  return Mask128{_mm_fpclass_pd_mask(v.raw, 0x81)};
+#else
+  return Mask128{_mm_cmpunord_pd(v.raw, v.raw)};
+#endif
+}
+
+#if HWY_TARGET <= HWY_AVX3
+
+template
+HWY_API Mask128 IsInf(const Vec128 v) {
+  return Mask128{_mm_fpclass_ps_mask(v.raw, 0x18)};
+}
+template
+HWY_API Mask128 IsInf(const Vec128 v) {
+  return Mask128{_mm_fpclass_pd_mask(v.raw, 0x18)};
+}
+
+// Returns whether normal/subnormal/zero.
+template
+HWY_API Mask128 IsFinite(const Vec128 v) {
+  // fpclass doesn't have a flag for positive, so we have to check for inf/NaN
+  // and negate the mask.
+  return Not(Mask128{_mm_fpclass_ps_mask(v.raw, 0x99)});
+}
+template
+HWY_API Mask128 IsFinite(const Vec128 v) {
+  return Not(Mask128{_mm_fpclass_pd_mask(v.raw, 0x99)});
+}
+
+#else
+
+template
+HWY_API Mask128 IsInf(const Vec128 v) {
+  static_assert(IsFloat(), "Only for float");
+  const Simd d;
+  const RebindToSigned di;
+  const VFromD vi = BitCast(di, v);
+  // 'Shift left' to clear the sign bit, check for exponent=max and mantissa=0.
+  return RebindMask(d, Eq(Add(vi, vi), Set(di, hwy::MaxExponentTimes2())));
+}
+
+// Returns whether normal/subnormal/zero.
+template
+HWY_API Mask128 IsFinite(const Vec128 v) {
+  static_assert(IsFloat(), "Only for float");
+  const Simd d;
+  const RebindToUnsigned du;
+  const RebindToSigned di;  // cheaper than unsigned comparison
+  const VFromD vu = BitCast(du, v);
+  // Shift left to clear the sign bit, then right so we can compare with the
+  // max exponent (cannot compare with MaxExponentTimes2 directly because it
+  // is negative and non-negative floats would be greater). MSVC seems to
+  // generate incorrect code if we instead add vu + vu.
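+  // e.g. for float: ShiftLeft<1> drops the sign, the right shift leaves the 8
+  // exponent bits, and finite means exp < 0xFF (the MaxExponentField).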
+ const VFromD exp = + BitCast(di, ShiftRight() + 1>(ShiftLeft<1>(vu))); + return RebindMask(d, Lt(exp, Set(di, hwy::MaxExponentField()))); +} + +#endif // HWY_TARGET <= HWY_AVX3 + +// ================================================== CRYPTO + +#if !defined(HWY_DISABLE_PCLMUL_AES) && HWY_TARGET != HWY_SSSE3 + +// Per-target flag to prevent generic_ops-inl.h from defining AESRound. +#ifdef HWY_NATIVE_AES +#undef HWY_NATIVE_AES +#else +#define HWY_NATIVE_AES +#endif + +HWY_API Vec128 AESRound(Vec128 state, + Vec128 round_key) { + return Vec128{_mm_aesenc_si128(state.raw, round_key.raw)}; +} + +HWY_API Vec128 AESLastRound(Vec128 state, + Vec128 round_key) { + return Vec128{_mm_aesenclast_si128(state.raw, round_key.raw)}; +} + +template +HWY_API Vec128 CLMulLower(Vec128 a, + Vec128 b) { + return Vec128{_mm_clmulepi64_si128(a.raw, b.raw, 0x00)}; +} + +template +HWY_API Vec128 CLMulUpper(Vec128 a, + Vec128 b) { + return Vec128{_mm_clmulepi64_si128(a.raw, b.raw, 0x11)}; +} + +#endif // !defined(HWY_DISABLE_PCLMUL_AES) && HWY_TARGET != HWY_SSSE3 + +// ================================================== MISC + +template +struct CompressIsPartition { +#if HWY_TARGET <= HWY_AVX3 + // AVX3 supports native compress, but a table-based approach allows + // 'partitioning' (also moving mask=false lanes to the top), which helps + // vqsort. This is only feasible for eight or less lanes, i.e. sizeof(T) == 8 + // on AVX3. For simplicity, we only use tables for 64-bit lanes (not AVX3 + // u32x8 etc.). + enum { value = (sizeof(T) == 8) }; +#else + enum { value = 1 }; +#endif +}; + +#if HWY_TARGET <= HWY_AVX3 + +// ------------------------------ LoadMaskBits + +// `p` points to at least 8 readable bytes, not all of which need be valid. +template +HWY_API Mask128 LoadMaskBits(Simd /* tag */, + const uint8_t* HWY_RESTRICT bits) { + uint64_t mask_bits = 0; + constexpr size_t kNumBytes = (N + 7) / 8; + CopyBytes(bits, &mask_bits); + if (N < 8) { + mask_bits &= (1ull << N) - 1; + } + + return Mask128::FromBits(mask_bits); +} + +// ------------------------------ StoreMaskBits + +// `p` points to at least 8 writable bytes. +template +HWY_API size_t StoreMaskBits(const Simd /* tag */, + const Mask128 mask, uint8_t* bits) { + constexpr size_t kNumBytes = (N + 7) / 8; + CopyBytes(&mask.raw, bits); + + // Non-full byte, need to clear the undefined upper bits. + if (N < 8) { + const int mask_bits = (1 << N) - 1; + bits[0] = static_cast(bits[0] & mask_bits); + } + + return kNumBytes; +} + +// ------------------------------ Mask testing + +// Beware: the suffix indicates the number of mask bits, not lane size! + +template +HWY_API size_t CountTrue(const Simd /* tag */, + const Mask128 mask) { + const uint64_t mask_bits = static_cast(mask.raw) & ((1u << N) - 1); + return PopCount(mask_bits); +} + +template +HWY_API size_t FindKnownFirstTrue(const Simd /* tag */, + const Mask128 mask) { + const uint32_t mask_bits = static_cast(mask.raw) & ((1u << N) - 1); + return Num0BitsBelowLS1Bit_Nonzero32(mask_bits); +} + +template +HWY_API intptr_t FindFirstTrue(const Simd /* tag */, + const Mask128 mask) { + const uint32_t mask_bits = static_cast(mask.raw) & ((1u << N) - 1); + return mask_bits ? 
intptr_t(Num0BitsBelowLS1Bit_Nonzero32(mask_bits)) : -1; +} + +template +HWY_API bool AllFalse(const Simd /* tag */, const Mask128 mask) { + const uint64_t mask_bits = static_cast(mask.raw) & ((1u << N) - 1); + return mask_bits == 0; +} + +template +HWY_API bool AllTrue(const Simd /* tag */, const Mask128 mask) { + const uint64_t mask_bits = static_cast(mask.raw) & ((1u << N) - 1); + // Cannot use _kortestc because we may have less than 8 mask bits. + return mask_bits == (1u << N) - 1; +} + +// ------------------------------ Compress + +#if HWY_TARGET != HWY_AVX3_DL +namespace detail { + +// Returns permutevar_epi16 indices for 16-bit Compress. Also used by x86_256. +HWY_INLINE Vec128 IndicesForCompress16(uint64_t mask_bits) { + Full128 du16; + // Table of u16 indices packed into bytes to reduce L1 usage. Will be unpacked + // to u16. Ideally we would broadcast 8*3 (half of the 8 bytes currently used) + // bits into each lane and then varshift, but that does not fit in 16 bits. + Rebind du8; + alignas(16) constexpr uint8_t tbl[2048] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, + 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 1, 2, + 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, + 0, 0, 0, 0, 0, 1, 3, 0, 0, 0, 0, 0, 0, 0, 1, 3, 0, 0, 0, 0, 0, 2, 3, 0, 0, + 0, 0, 0, 0, 0, 2, 3, 0, 0, 0, 0, 0, 1, 2, 3, 0, 0, 0, 0, 0, 0, 1, 2, 3, 0, + 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 1, 4, 0, 0, 0, 0, + 0, 0, 0, 1, 4, 0, 0, 0, 0, 0, 2, 4, 0, 0, 0, 0, 0, 0, 0, 2, 4, 0, 0, 0, 0, + 0, 1, 2, 4, 0, 0, 0, 0, 0, 0, 1, 2, 4, 0, 0, 0, 0, 3, 4, 0, 0, 0, 0, 0, 0, + 0, 3, 4, 0, 0, 0, 0, 0, 1, 3, 4, 0, 0, 0, 0, 0, 0, 1, 3, 4, 0, 0, 0, 0, 2, + 3, 4, 0, 0, 0, 0, 0, 0, 2, 3, 4, 0, 0, 0, 0, 1, 2, 3, 4, 0, 0, 0, 0, 0, 1, + 2, 3, 4, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 1, 5, 0, + 0, 0, 0, 0, 0, 0, 1, 5, 0, 0, 0, 0, 0, 2, 5, 0, 0, 0, 0, 0, 0, 0, 2, 5, 0, + 0, 0, 0, 0, 1, 2, 5, 0, 0, 0, 0, 0, 0, 1, 2, 5, 0, 0, 0, 0, 3, 5, 0, 0, 0, + 0, 0, 0, 0, 3, 5, 0, 0, 0, 0, 0, 1, 3, 5, 0, 0, 0, 0, 0, 0, 1, 3, 5, 0, 0, + 0, 0, 2, 3, 5, 0, 0, 0, 0, 0, 0, 2, 3, 5, 0, 0, 0, 0, 1, 2, 3, 5, 0, 0, 0, + 0, 0, 1, 2, 3, 5, 0, 0, 0, 4, 5, 0, 0, 0, 0, 0, 0, 0, 4, 5, 0, 0, 0, 0, 0, + 1, 4, 5, 0, 0, 0, 0, 0, 0, 1, 4, 5, 0, 0, 0, 0, 2, 4, 5, 0, 0, 0, 0, 0, 0, + 2, 4, 5, 0, 0, 0, 0, 1, 2, 4, 5, 0, 0, 0, 0, 0, 1, 2, 4, 5, 0, 0, 0, 3, 4, + 5, 0, 0, 0, 0, 0, 0, 3, 4, 5, 0, 0, 0, 0, 1, 3, 4, 5, 0, 0, 0, 0, 0, 1, 3, + 4, 5, 0, 0, 0, 2, 3, 4, 5, 0, 0, 0, 0, 0, 2, 3, 4, 5, 0, 0, 0, 1, 2, 3, 4, + 5, 0, 0, 0, 0, 1, 2, 3, 4, 5, 0, 0, 6, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 0, 0, + 0, 0, 0, 1, 6, 0, 0, 0, 0, 0, 0, 0, 1, 6, 0, 0, 0, 0, 0, 2, 6, 0, 0, 0, 0, + 0, 0, 0, 2, 6, 0, 0, 0, 0, 0, 1, 2, 6, 0, 0, 0, 0, 0, 0, 1, 2, 6, 0, 0, 0, + 0, 3, 6, 0, 0, 0, 0, 0, 0, 0, 3, 6, 0, 0, 0, 0, 0, 1, 3, 6, 0, 0, 0, 0, 0, + 0, 1, 3, 6, 0, 0, 0, 0, 2, 3, 6, 0, 0, 0, 0, 0, 0, 2, 3, 6, 0, 0, 0, 0, 1, + 2, 3, 6, 0, 0, 0, 0, 0, 1, 2, 3, 6, 0, 0, 0, 4, 6, 0, 0, 0, 0, 0, 0, 0, 4, + 6, 0, 0, 0, 0, 0, 1, 4, 6, 0, 0, 0, 0, 0, 0, 1, 4, 6, 0, 0, 0, 0, 2, 4, 6, + 0, 0, 0, 0, 0, 0, 2, 4, 6, 0, 0, 0, 0, 1, 2, 4, 6, 0, 0, 0, 0, 0, 1, 2, 4, + 6, 0, 0, 0, 3, 4, 6, 0, 0, 0, 0, 0, 0, 3, 4, 6, 0, 0, 0, 0, 1, 3, 4, 6, 0, + 0, 0, 0, 0, 1, 3, 4, 6, 0, 0, 0, 2, 3, 4, 6, 0, 0, 0, 0, 0, 2, 3, 4, 6, 0, + 0, 0, 1, 2, 3, 4, 6, 0, 0, 0, 0, 1, 2, 3, 4, 6, 0, 0, 5, 6, 0, 0, 0, 0, 0, + 0, 0, 5, 6, 0, 0, 0, 0, 0, 1, 5, 6, 0, 0, 0, 0, 0, 0, 1, 5, 6, 0, 0, 0, 0, + 2, 5, 6, 0, 0, 0, 0, 0, 0, 2, 5, 6, 
0, 0, 0, 0, 1, 2, 5, 6, 0, 0, 0, 0, 0, + 1, 2, 5, 6, 0, 0, 0, 3, 5, 6, 0, 0, 0, 0, 0, 0, 3, 5, 6, 0, 0, 0, 0, 1, 3, + 5, 6, 0, 0, 0, 0, 0, 1, 3, 5, 6, 0, 0, 0, 2, 3, 5, 6, 0, 0, 0, 0, 0, 2, 3, + 5, 6, 0, 0, 0, 1, 2, 3, 5, 6, 0, 0, 0, 0, 1, 2, 3, 5, 6, 0, 0, 4, 5, 6, 0, + 0, 0, 0, 0, 0, 4, 5, 6, 0, 0, 0, 0, 1, 4, 5, 6, 0, 0, 0, 0, 0, 1, 4, 5, 6, + 0, 0, 0, 2, 4, 5, 6, 0, 0, 0, 0, 0, 2, 4, 5, 6, 0, 0, 0, 1, 2, 4, 5, 6, 0, + 0, 0, 0, 1, 2, 4, 5, 6, 0, 0, 3, 4, 5, 6, 0, 0, 0, 0, 0, 3, 4, 5, 6, 0, 0, + 0, 1, 3, 4, 5, 6, 0, 0, 0, 0, 1, 3, 4, 5, 6, 0, 0, 2, 3, 4, 5, 6, 0, 0, 0, + 0, 2, 3, 4, 5, 6, 0, 0, 1, 2, 3, 4, 5, 6, 0, 0, 0, 1, 2, 3, 4, 5, 6, 0, 7, + 0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 1, 7, 0, 0, 0, 0, 0, 0, 0, 1, + 7, 0, 0, 0, 0, 0, 2, 7, 0, 0, 0, 0, 0, 0, 0, 2, 7, 0, 0, 0, 0, 0, 1, 2, 7, + 0, 0, 0, 0, 0, 0, 1, 2, 7, 0, 0, 0, 0, 3, 7, 0, 0, 0, 0, 0, 0, 0, 3, 7, 0, + 0, 0, 0, 0, 1, 3, 7, 0, 0, 0, 0, 0, 0, 1, 3, 7, 0, 0, 0, 0, 2, 3, 7, 0, 0, + 0, 0, 0, 0, 2, 3, 7, 0, 0, 0, 0, 1, 2, 3, 7, 0, 0, 0, 0, 0, 1, 2, 3, 7, 0, + 0, 0, 4, 7, 0, 0, 0, 0, 0, 0, 0, 4, 7, 0, 0, 0, 0, 0, 1, 4, 7, 0, 0, 0, 0, + 0, 0, 1, 4, 7, 0, 0, 0, 0, 2, 4, 7, 0, 0, 0, 0, 0, 0, 2, 4, 7, 0, 0, 0, 0, + 1, 2, 4, 7, 0, 0, 0, 0, 0, 1, 2, 4, 7, 0, 0, 0, 3, 4, 7, 0, 0, 0, 0, 0, 0, + 3, 4, 7, 0, 0, 0, 0, 1, 3, 4, 7, 0, 0, 0, 0, 0, 1, 3, 4, 7, 0, 0, 0, 2, 3, + 4, 7, 0, 0, 0, 0, 0, 2, 3, 4, 7, 0, 0, 0, 1, 2, 3, 4, 7, 0, 0, 0, 0, 1, 2, + 3, 4, 7, 0, 0, 5, 7, 0, 0, 0, 0, 0, 0, 0, 5, 7, 0, 0, 0, 0, 0, 1, 5, 7, 0, + 0, 0, 0, 0, 0, 1, 5, 7, 0, 0, 0, 0, 2, 5, 7, 0, 0, 0, 0, 0, 0, 2, 5, 7, 0, + 0, 0, 0, 1, 2, 5, 7, 0, 0, 0, 0, 0, 1, 2, 5, 7, 0, 0, 0, 3, 5, 7, 0, 0, 0, + 0, 0, 0, 3, 5, 7, 0, 0, 0, 0, 1, 3, 5, 7, 0, 0, 0, 0, 0, 1, 3, 5, 7, 0, 0, + 0, 2, 3, 5, 7, 0, 0, 0, 0, 0, 2, 3, 5, 7, 0, 0, 0, 1, 2, 3, 5, 7, 0, 0, 0, + 0, 1, 2, 3, 5, 7, 0, 0, 4, 5, 7, 0, 0, 0, 0, 0, 0, 4, 5, 7, 0, 0, 0, 0, 1, + 4, 5, 7, 0, 0, 0, 0, 0, 1, 4, 5, 7, 0, 0, 0, 2, 4, 5, 7, 0, 0, 0, 0, 0, 2, + 4, 5, 7, 0, 0, 0, 1, 2, 4, 5, 7, 0, 0, 0, 0, 1, 2, 4, 5, 7, 0, 0, 3, 4, 5, + 7, 0, 0, 0, 0, 0, 3, 4, 5, 7, 0, 0, 0, 1, 3, 4, 5, 7, 0, 0, 0, 0, 1, 3, 4, + 5, 7, 0, 0, 2, 3, 4, 5, 7, 0, 0, 0, 0, 2, 3, 4, 5, 7, 0, 0, 1, 2, 3, 4, 5, + 7, 0, 0, 0, 1, 2, 3, 4, 5, 7, 0, 6, 7, 0, 0, 0, 0, 0, 0, 0, 6, 7, 0, 0, 0, + 0, 0, 1, 6, 7, 0, 0, 0, 0, 0, 0, 1, 6, 7, 0, 0, 0, 0, 2, 6, 7, 0, 0, 0, 0, + 0, 0, 2, 6, 7, 0, 0, 0, 0, 1, 2, 6, 7, 0, 0, 0, 0, 0, 1, 2, 6, 7, 0, 0, 0, + 3, 6, 7, 0, 0, 0, 0, 0, 0, 3, 6, 7, 0, 0, 0, 0, 1, 3, 6, 7, 0, 0, 0, 0, 0, + 1, 3, 6, 7, 0, 0, 0, 2, 3, 6, 7, 0, 0, 0, 0, 0, 2, 3, 6, 7, 0, 0, 0, 1, 2, + 3, 6, 7, 0, 0, 0, 0, 1, 2, 3, 6, 7, 0, 0, 4, 6, 7, 0, 0, 0, 0, 0, 0, 4, 6, + 7, 0, 0, 0, 0, 1, 4, 6, 7, 0, 0, 0, 0, 0, 1, 4, 6, 7, 0, 0, 0, 2, 4, 6, 7, + 0, 0, 0, 0, 0, 2, 4, 6, 7, 0, 0, 0, 1, 2, 4, 6, 7, 0, 0, 0, 0, 1, 2, 4, 6, + 7, 0, 0, 3, 4, 6, 7, 0, 0, 0, 0, 0, 3, 4, 6, 7, 0, 0, 0, 1, 3, 4, 6, 7, 0, + 0, 0, 0, 1, 3, 4, 6, 7, 0, 0, 2, 3, 4, 6, 7, 0, 0, 0, 0, 2, 3, 4, 6, 7, 0, + 0, 1, 2, 3, 4, 6, 7, 0, 0, 0, 1, 2, 3, 4, 6, 7, 0, 5, 6, 7, 0, 0, 0, 0, 0, + 0, 5, 6, 7, 0, 0, 0, 0, 1, 5, 6, 7, 0, 0, 0, 0, 0, 1, 5, 6, 7, 0, 0, 0, 2, + 5, 6, 7, 0, 0, 0, 0, 0, 2, 5, 6, 7, 0, 0, 0, 1, 2, 5, 6, 7, 0, 0, 0, 0, 1, + 2, 5, 6, 7, 0, 0, 3, 5, 6, 7, 0, 0, 0, 0, 0, 3, 5, 6, 7, 0, 0, 0, 1, 3, 5, + 6, 7, 0, 0, 0, 0, 1, 3, 5, 6, 7, 0, 0, 2, 3, 5, 6, 7, 0, 0, 0, 0, 2, 3, 5, + 6, 7, 0, 0, 1, 2, 3, 5, 6, 7, 0, 0, 0, 1, 2, 3, 5, 6, 7, 0, 4, 5, 6, 7, 0, + 0, 0, 0, 0, 4, 5, 6, 7, 0, 0, 0, 1, 4, 5, 6, 7, 0, 0, 0, 0, 1, 4, 5, 6, 7, + 0, 0, 2, 4, 5, 6, 7, 0, 0, 0, 0, 2, 4, 5, 6, 7, 
0, 0, 1, 2, 4, 5, 6, 7, 0, + 0, 0, 1, 2, 4, 5, 6, 7, 0, 3, 4, 5, 6, 7, 0, 0, 0, 0, 3, 4, 5, 6, 7, 0, 0, + 1, 3, 4, 5, 6, 7, 0, 0, 0, 1, 3, 4, 5, 6, 7, 0, 2, 3, 4, 5, 6, 7, 0, 0, 0, + 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 0, 1, 2, 3, 4, 5, 6, 7}; + return PromoteTo(du16, Load(du8, tbl + mask_bits * 8)); +} + +} // namespace detail +#endif // HWY_TARGET != HWY_AVX3_DL + +// Single lane: no-op +template +HWY_API Vec128 Compress(Vec128 v, Mask128 /*m*/) { + return v; +} + +template +HWY_API Vec128 Compress(Vec128 v, Mask128 mask) { + const Simd d; + const Rebind du; + const auto vu = BitCast(du, v); // (required for float16_t inputs) + +#if HWY_TARGET == HWY_AVX3_DL // VBMI2 + const Vec128 cu{_mm_maskz_compress_epi16(mask.raw, vu.raw)}; +#else + const auto idx = detail::IndicesForCompress16(uint64_t{mask.raw}); + const Vec128 cu{_mm_permutexvar_epi16(idx.raw, vu.raw)}; +#endif // HWY_TARGET != HWY_AVX3_DL + return BitCast(d, cu); +} + +template +HWY_API Vec128 Compress(Vec128 v, Mask128 mask) { + return Vec128{_mm_maskz_compress_epi32(mask.raw, v.raw)}; +} + +template +HWY_API Vec128 Compress(Vec128 v, Mask128 mask) { + return Vec128{_mm_maskz_compress_ps(mask.raw, v.raw)}; +} + +template +HWY_API Vec128 Compress(Vec128 v, Mask128 mask) { + HWY_DASSERT(mask.raw < 4); + + // There are only 2 lanes, so we can afford to load the index vector directly. + alignas(16) constexpr uint8_t u8_indices[64] = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; + + const Full128 d; + const Repartition d8; + const auto index = Load(d8, u8_indices + 16 * mask.raw); + return BitCast(d, TableLookupBytes(BitCast(d8, v), index)); +} + +// ------------------------------ CompressNot (Compress) + +// Single lane: no-op +template +HWY_API Vec128 CompressNot(Vec128 v, Mask128 /*m*/) { + return v; +} + +template +HWY_API Vec128 CompressNot(Vec128 v, Mask128 mask) { + return Compress(v, Not(mask)); +} + +// ------------------------------ CompressBlocksNot +HWY_API Vec128 CompressBlocksNot(Vec128 v, + Mask128 /* m */) { + return v; +} + +// ------------------------------ CompressBits (LoadMaskBits) + +template +HWY_API Vec128 CompressBits(Vec128 v, + const uint8_t* HWY_RESTRICT bits) { + return Compress(v, LoadMaskBits(Simd(), bits)); +} + +// ------------------------------ CompressStore + +template +HWY_API size_t CompressStore(Vec128 v, Mask128 mask, + Simd d, T* HWY_RESTRICT unaligned) { + const Rebind du; + const auto vu = BitCast(du, v); // (required for float16_t inputs) + + const uint64_t mask_bits{mask.raw}; + +#if HWY_TARGET == HWY_AVX3_DL // VBMI2 + _mm_mask_compressstoreu_epi16(unaligned, mask.raw, vu.raw); +#else + const auto idx = detail::IndicesForCompress16(mask_bits); + const Vec128 cu{_mm_permutexvar_epi16(idx.raw, vu.raw)}; + StoreU(BitCast(d, cu), d, unaligned); +#endif // HWY_TARGET == HWY_AVX3_DL + + const size_t count = PopCount(mask_bits & ((1ull << N) - 1)); + // Workaround for MSAN not marking output as initialized (b/233326619) +#if HWY_IS_MSAN + __msan_unpoison(unaligned, count * sizeof(T)); +#endif + return count; +} + +template +HWY_API size_t CompressStore(Vec128 v, Mask128 mask, + Simd /* tag */, + T* HWY_RESTRICT unaligned) { + _mm_mask_compressstoreu_epi32(unaligned, mask.raw, v.raw); + const size_t count = PopCount(uint64_t{mask.raw} & ((1ull << N) - 1)); + // Workaround for MSAN not marking output as 
initialized (b/233326619) +#if HWY_IS_MSAN + __msan_unpoison(unaligned, count * sizeof(T)); +#endif + return count; +} + +template +HWY_API size_t CompressStore(Vec128 v, Mask128 mask, + Simd /* tag */, + T* HWY_RESTRICT unaligned) { + _mm_mask_compressstoreu_epi64(unaligned, mask.raw, v.raw); + const size_t count = PopCount(uint64_t{mask.raw} & ((1ull << N) - 1)); + // Workaround for MSAN not marking output as initialized (b/233326619) +#if HWY_IS_MSAN + __msan_unpoison(unaligned, count * sizeof(T)); +#endif + return count; +} + +template +HWY_API size_t CompressStore(Vec128 v, Mask128 mask, + Simd /* tag */, + float* HWY_RESTRICT unaligned) { + _mm_mask_compressstoreu_ps(unaligned, mask.raw, v.raw); + const size_t count = PopCount(uint64_t{mask.raw} & ((1ull << N) - 1)); + // Workaround for MSAN not marking output as initialized (b/233326619) +#if HWY_IS_MSAN + __msan_unpoison(unaligned, count * sizeof(float)); +#endif + return count; +} + +template +HWY_API size_t CompressStore(Vec128 v, Mask128 mask, + Simd /* tag */, + double* HWY_RESTRICT unaligned) { + _mm_mask_compressstoreu_pd(unaligned, mask.raw, v.raw); + const size_t count = PopCount(uint64_t{mask.raw} & ((1ull << N) - 1)); + // Workaround for MSAN not marking output as initialized (b/233326619) +#if HWY_IS_MSAN + __msan_unpoison(unaligned, count * sizeof(double)); +#endif + return count; +} + +// ------------------------------ CompressBlendedStore (CompressStore) +template +HWY_API size_t CompressBlendedStore(Vec128 v, Mask128 m, + Simd d, + T* HWY_RESTRICT unaligned) { + // AVX-512 already does the blending at no extra cost (latency 11, + // rthroughput 2 - same as compress plus store). + if (HWY_TARGET == HWY_AVX3_DL || sizeof(T) != 2) { + // We're relying on the mask to blend. Clear the undefined upper bits. + if (N != 16 / sizeof(T)) { + m = And(m, FirstN(d, N)); + } + return CompressStore(v, m, d, unaligned); + } else { + const size_t count = CountTrue(d, m); + const Vec128 compressed = Compress(v, m); +#if HWY_MEM_OPS_MIGHT_FAULT + // BlendedStore tests mask for each lane, but we know that the mask is + // FirstN, so we can just copy. + alignas(16) T buf[N]; + Store(compressed, d, buf); + memcpy(unaligned, buf, count * sizeof(T)); +#else + BlendedStore(compressed, FirstN(d, count), d, unaligned); +#endif + // Workaround: as of 2022-02-23 MSAN does not mark the output as + // initialized. +#if HWY_IS_MSAN + __msan_unpoison(unaligned, count * sizeof(T)); +#endif + return count; + } +} + +// ------------------------------ CompressBitsStore (LoadMaskBits) + +template +HWY_API size_t CompressBitsStore(Vec128 v, + const uint8_t* HWY_RESTRICT bits, + Simd d, T* HWY_RESTRICT unaligned) { + return CompressStore(v, LoadMaskBits(d, bits), d, unaligned); +} + +#else // AVX2 or below + +// ------------------------------ LoadMaskBits (TestBit) + +namespace detail { + +template +HWY_INLINE Mask128 LoadMaskBits(Simd d, uint64_t mask_bits) { + const RebindToUnsigned du; + // Easier than Set(), which would require an >8-bit type, which would not + // compile for T=uint8_t, N=1. + const Vec128 vbits{_mm_cvtsi32_si128(static_cast(mask_bits))}; + + // Replicate bytes 8x such that each byte contains the bit that governs it. 
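+  // e.g. mask bit 3 lives in bit 3 of byte 0; kRep8 copies byte 0 into lanes
+  // 0..7, so TestBit with kBit[3] = 8 recovers it for lane 3.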
+ alignas(16) constexpr uint8_t kRep8[16] = {0, 0, 0, 0, 0, 0, 0, 0, + 1, 1, 1, 1, 1, 1, 1, 1}; + const auto rep8 = TableLookupBytes(vbits, Load(du, kRep8)); + + alignas(16) constexpr uint8_t kBit[16] = {1, 2, 4, 8, 16, 32, 64, 128, + 1, 2, 4, 8, 16, 32, 64, 128}; + return RebindMask(d, TestBit(rep8, LoadDup128(du, kBit))); +} + +template +HWY_INLINE Mask128 LoadMaskBits(Simd d, uint64_t mask_bits) { + const RebindToUnsigned du; + alignas(16) constexpr uint16_t kBit[8] = {1, 2, 4, 8, 16, 32, 64, 128}; + const auto vmask_bits = Set(du, static_cast(mask_bits)); + return RebindMask(d, TestBit(vmask_bits, Load(du, kBit))); +} + +template +HWY_INLINE Mask128 LoadMaskBits(Simd d, uint64_t mask_bits) { + const RebindToUnsigned du; + alignas(16) constexpr uint32_t kBit[8] = {1, 2, 4, 8}; + const auto vmask_bits = Set(du, static_cast(mask_bits)); + return RebindMask(d, TestBit(vmask_bits, Load(du, kBit))); +} + +template +HWY_INLINE Mask128 LoadMaskBits(Simd d, uint64_t mask_bits) { + const RebindToUnsigned du; + alignas(16) constexpr uint64_t kBit[8] = {1, 2}; + return RebindMask(d, TestBit(Set(du, mask_bits), Load(du, kBit))); +} + +} // namespace detail + +// `p` points to at least 8 readable bytes, not all of which need be valid. +template +HWY_API Mask128 LoadMaskBits(Simd d, + const uint8_t* HWY_RESTRICT bits) { + uint64_t mask_bits = 0; + constexpr size_t kNumBytes = (N + 7) / 8; + CopyBytes(bits, &mask_bits); + if (N < 8) { + mask_bits &= (1ull << N) - 1; + } + + return detail::LoadMaskBits(d, mask_bits); +} + +// ------------------------------ StoreMaskBits + +namespace detail { + +constexpr HWY_INLINE uint64_t U64FromInt(int mask_bits) { + return static_cast(static_cast(mask_bits)); +} + +template +HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/, + const Mask128 mask) { + const Simd d; + const auto sign_bits = BitCast(d, VecFromMask(d, mask)).raw; + return U64FromInt(_mm_movemask_epi8(sign_bits)); +} + +template +HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<2> /*tag*/, + const Mask128 mask) { + // Remove useless lower half of each u16 while preserving the sign bit. + const auto sign_bits = _mm_packs_epi16(mask.raw, _mm_setzero_si128()); + return U64FromInt(_mm_movemask_epi8(sign_bits)); +} + +template +HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<4> /*tag*/, + const Mask128 mask) { + const Simd d; + const Simd df; + const auto sign_bits = BitCast(df, VecFromMask(d, mask)); + return U64FromInt(_mm_movemask_ps(sign_bits.raw)); +} + +template +HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<8> /*tag*/, + const Mask128 mask) { + const Simd d; + const Simd df; + const auto sign_bits = BitCast(df, VecFromMask(d, mask)); + return U64FromInt(_mm_movemask_pd(sign_bits.raw)); +} + +// Returns the lowest N of the _mm_movemask* bits. +template +constexpr uint64_t OnlyActive(uint64_t mask_bits) { + return ((N * sizeof(T)) == 16) ? mask_bits : mask_bits & ((1ull << N) - 1); +} + +template +HWY_INLINE uint64_t BitsFromMask(const Mask128 mask) { + return OnlyActive(BitsFromMask(hwy::SizeTag(), mask)); +} + +} // namespace detail + +// `p` points to at least 8 writable bytes. 
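+// Illustrative usage sketch (variable names `d`, `mask`, `bits` assumed), for
+// a tag with at most 8 lanes:
+//   uint8_t bits[8];
+//   const size_t written = StoreMaskBits(d, mask, bits);  // lane i -> bit i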
+template +HWY_API size_t StoreMaskBits(const Simd /* tag */, + const Mask128 mask, uint8_t* bits) { + constexpr size_t kNumBytes = (N + 7) / 8; + const uint64_t mask_bits = detail::BitsFromMask(mask); + CopyBytes(&mask_bits, bits); + return kNumBytes; +} + +// ------------------------------ Mask testing + +template +HWY_API bool AllFalse(const Simd /* tag */, const Mask128 mask) { + // Cheaper than PTEST, which is 2 uop / 3L. + return detail::BitsFromMask(mask) == 0; +} + +template +HWY_API bool AllTrue(const Simd /* tag */, const Mask128 mask) { + constexpr uint64_t kAllBits = + detail::OnlyActive((1ull << (16 / sizeof(T))) - 1); + return detail::BitsFromMask(mask) == kAllBits; +} + +template +HWY_API size_t CountTrue(const Simd /* tag */, + const Mask128 mask) { + return PopCount(detail::BitsFromMask(mask)); +} + +template +HWY_API size_t FindKnownFirstTrue(const Simd /* tag */, + const Mask128 mask) { + const uint64_t mask_bits = detail::BitsFromMask(mask); + return Num0BitsBelowLS1Bit_Nonzero64(mask_bits); +} + +template +HWY_API intptr_t FindFirstTrue(const Simd /* tag */, + const Mask128 mask) { + const uint64_t mask_bits = detail::BitsFromMask(mask); + return mask_bits ? intptr_t(Num0BitsBelowLS1Bit_Nonzero64(mask_bits)) : -1; +} + +// ------------------------------ Compress, CompressBits + +namespace detail { + +// Also works for N < 8 because the first 16 4-tuples only reference bytes 0-6. +template +HWY_INLINE Vec128 IndicesFromBits(Simd d, uint64_t mask_bits) { + HWY_DASSERT(mask_bits < 256); + const Rebind d8; + const Simd du; + + // compress_epi16 requires VBMI2 and there is no permutevar_epi16, so we need + // byte indices for PSHUFB (one vector's worth for each of 256 combinations of + // 8 mask bits). Loading them directly would require 4 KiB. We can instead + // store lane indices and convert to byte indices (2*lane + 0..1), with the + // doubling baked into the table. AVX2 Compress32 stores eight 4-bit lane + // indices (total 1 KiB), broadcasts them into each 32-bit lane and shifts. + // Here, 16-bit lanes are too narrow to hold all bits, and unpacking nibbles + // is likely more costly than the higher cache footprint from storing bytes. 
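+  // e.g. mask_bits = 0b101 (lanes 0 and 2): the row begins 0, 4 (lane indices
+  // pre-doubled); ZipLower repeats each byte and adding 0x0100 yields the u16
+  // pairs 0x0100 and 0x0504, i.e. bytes {0,1} and {4,5} = 16-bit lanes 0, 2.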
+ alignas(16) constexpr uint8_t table[2048] = { + // PrintCompress16x8Tables + 0, 2, 4, 6, 8, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // + 2, 0, 4, 6, 8, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // + 4, 0, 2, 6, 8, 10, 12, 14, /**/ 0, 4, 2, 6, 8, 10, 12, 14, // + 2, 4, 0, 6, 8, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // + 6, 0, 2, 4, 8, 10, 12, 14, /**/ 0, 6, 2, 4, 8, 10, 12, 14, // + 2, 6, 0, 4, 8, 10, 12, 14, /**/ 0, 2, 6, 4, 8, 10, 12, 14, // + 4, 6, 0, 2, 8, 10, 12, 14, /**/ 0, 4, 6, 2, 8, 10, 12, 14, // + 2, 4, 6, 0, 8, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // + 8, 0, 2, 4, 6, 10, 12, 14, /**/ 0, 8, 2, 4, 6, 10, 12, 14, // + 2, 8, 0, 4, 6, 10, 12, 14, /**/ 0, 2, 8, 4, 6, 10, 12, 14, // + 4, 8, 0, 2, 6, 10, 12, 14, /**/ 0, 4, 8, 2, 6, 10, 12, 14, // + 2, 4, 8, 0, 6, 10, 12, 14, /**/ 0, 2, 4, 8, 6, 10, 12, 14, // + 6, 8, 0, 2, 4, 10, 12, 14, /**/ 0, 6, 8, 2, 4, 10, 12, 14, // + 2, 6, 8, 0, 4, 10, 12, 14, /**/ 0, 2, 6, 8, 4, 10, 12, 14, // + 4, 6, 8, 0, 2, 10, 12, 14, /**/ 0, 4, 6, 8, 2, 10, 12, 14, // + 2, 4, 6, 8, 0, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // + 10, 0, 2, 4, 6, 8, 12, 14, /**/ 0, 10, 2, 4, 6, 8, 12, 14, // + 2, 10, 0, 4, 6, 8, 12, 14, /**/ 0, 2, 10, 4, 6, 8, 12, 14, // + 4, 10, 0, 2, 6, 8, 12, 14, /**/ 0, 4, 10, 2, 6, 8, 12, 14, // + 2, 4, 10, 0, 6, 8, 12, 14, /**/ 0, 2, 4, 10, 6, 8, 12, 14, // + 6, 10, 0, 2, 4, 8, 12, 14, /**/ 0, 6, 10, 2, 4, 8, 12, 14, // + 2, 6, 10, 0, 4, 8, 12, 14, /**/ 0, 2, 6, 10, 4, 8, 12, 14, // + 4, 6, 10, 0, 2, 8, 12, 14, /**/ 0, 4, 6, 10, 2, 8, 12, 14, // + 2, 4, 6, 10, 0, 8, 12, 14, /**/ 0, 2, 4, 6, 10, 8, 12, 14, // + 8, 10, 0, 2, 4, 6, 12, 14, /**/ 0, 8, 10, 2, 4, 6, 12, 14, // + 2, 8, 10, 0, 4, 6, 12, 14, /**/ 0, 2, 8, 10, 4, 6, 12, 14, // + 4, 8, 10, 0, 2, 6, 12, 14, /**/ 0, 4, 8, 10, 2, 6, 12, 14, // + 2, 4, 8, 10, 0, 6, 12, 14, /**/ 0, 2, 4, 8, 10, 6, 12, 14, // + 6, 8, 10, 0, 2, 4, 12, 14, /**/ 0, 6, 8, 10, 2, 4, 12, 14, // + 2, 6, 8, 10, 0, 4, 12, 14, /**/ 0, 2, 6, 8, 10, 4, 12, 14, // + 4, 6, 8, 10, 0, 2, 12, 14, /**/ 0, 4, 6, 8, 10, 2, 12, 14, // + 2, 4, 6, 8, 10, 0, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // + 12, 0, 2, 4, 6, 8, 10, 14, /**/ 0, 12, 2, 4, 6, 8, 10, 14, // + 2, 12, 0, 4, 6, 8, 10, 14, /**/ 0, 2, 12, 4, 6, 8, 10, 14, // + 4, 12, 0, 2, 6, 8, 10, 14, /**/ 0, 4, 12, 2, 6, 8, 10, 14, // + 2, 4, 12, 0, 6, 8, 10, 14, /**/ 0, 2, 4, 12, 6, 8, 10, 14, // + 6, 12, 0, 2, 4, 8, 10, 14, /**/ 0, 6, 12, 2, 4, 8, 10, 14, // + 2, 6, 12, 0, 4, 8, 10, 14, /**/ 0, 2, 6, 12, 4, 8, 10, 14, // + 4, 6, 12, 0, 2, 8, 10, 14, /**/ 0, 4, 6, 12, 2, 8, 10, 14, // + 2, 4, 6, 12, 0, 8, 10, 14, /**/ 0, 2, 4, 6, 12, 8, 10, 14, // + 8, 12, 0, 2, 4, 6, 10, 14, /**/ 0, 8, 12, 2, 4, 6, 10, 14, // + 2, 8, 12, 0, 4, 6, 10, 14, /**/ 0, 2, 8, 12, 4, 6, 10, 14, // + 4, 8, 12, 0, 2, 6, 10, 14, /**/ 0, 4, 8, 12, 2, 6, 10, 14, // + 2, 4, 8, 12, 0, 6, 10, 14, /**/ 0, 2, 4, 8, 12, 6, 10, 14, // + 6, 8, 12, 0, 2, 4, 10, 14, /**/ 0, 6, 8, 12, 2, 4, 10, 14, // + 2, 6, 8, 12, 0, 4, 10, 14, /**/ 0, 2, 6, 8, 12, 4, 10, 14, // + 4, 6, 8, 12, 0, 2, 10, 14, /**/ 0, 4, 6, 8, 12, 2, 10, 14, // + 2, 4, 6, 8, 12, 0, 10, 14, /**/ 0, 2, 4, 6, 8, 12, 10, 14, // + 10, 12, 0, 2, 4, 6, 8, 14, /**/ 0, 10, 12, 2, 4, 6, 8, 14, // + 2, 10, 12, 0, 4, 6, 8, 14, /**/ 0, 2, 10, 12, 4, 6, 8, 14, // + 4, 10, 12, 0, 2, 6, 8, 14, /**/ 0, 4, 10, 12, 2, 6, 8, 14, // + 2, 4, 10, 12, 0, 6, 8, 14, /**/ 0, 2, 4, 10, 12, 6, 8, 14, // + 6, 10, 12, 0, 2, 4, 8, 14, /**/ 0, 6, 10, 12, 2, 4, 8, 14, // + 2, 6, 10, 12, 0, 4, 8, 14, /**/ 0, 2, 6, 10, 12, 4, 8, 14, // + 4, 6, 10, 12, 0, 2, 
8, 14, /**/ 0, 4, 6, 10, 12, 2, 8, 14, // + 2, 4, 6, 10, 12, 0, 8, 14, /**/ 0, 2, 4, 6, 10, 12, 8, 14, // + 8, 10, 12, 0, 2, 4, 6, 14, /**/ 0, 8, 10, 12, 2, 4, 6, 14, // + 2, 8, 10, 12, 0, 4, 6, 14, /**/ 0, 2, 8, 10, 12, 4, 6, 14, // + 4, 8, 10, 12, 0, 2, 6, 14, /**/ 0, 4, 8, 10, 12, 2, 6, 14, // + 2, 4, 8, 10, 12, 0, 6, 14, /**/ 0, 2, 4, 8, 10, 12, 6, 14, // + 6, 8, 10, 12, 0, 2, 4, 14, /**/ 0, 6, 8, 10, 12, 2, 4, 14, // + 2, 6, 8, 10, 12, 0, 4, 14, /**/ 0, 2, 6, 8, 10, 12, 4, 14, // + 4, 6, 8, 10, 12, 0, 2, 14, /**/ 0, 4, 6, 8, 10, 12, 2, 14, // + 2, 4, 6, 8, 10, 12, 0, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // + 14, 0, 2, 4, 6, 8, 10, 12, /**/ 0, 14, 2, 4, 6, 8, 10, 12, // + 2, 14, 0, 4, 6, 8, 10, 12, /**/ 0, 2, 14, 4, 6, 8, 10, 12, // + 4, 14, 0, 2, 6, 8, 10, 12, /**/ 0, 4, 14, 2, 6, 8, 10, 12, // + 2, 4, 14, 0, 6, 8, 10, 12, /**/ 0, 2, 4, 14, 6, 8, 10, 12, // + 6, 14, 0, 2, 4, 8, 10, 12, /**/ 0, 6, 14, 2, 4, 8, 10, 12, // + 2, 6, 14, 0, 4, 8, 10, 12, /**/ 0, 2, 6, 14, 4, 8, 10, 12, // + 4, 6, 14, 0, 2, 8, 10, 12, /**/ 0, 4, 6, 14, 2, 8, 10, 12, // + 2, 4, 6, 14, 0, 8, 10, 12, /**/ 0, 2, 4, 6, 14, 8, 10, 12, // + 8, 14, 0, 2, 4, 6, 10, 12, /**/ 0, 8, 14, 2, 4, 6, 10, 12, // + 2, 8, 14, 0, 4, 6, 10, 12, /**/ 0, 2, 8, 14, 4, 6, 10, 12, // + 4, 8, 14, 0, 2, 6, 10, 12, /**/ 0, 4, 8, 14, 2, 6, 10, 12, // + 2, 4, 8, 14, 0, 6, 10, 12, /**/ 0, 2, 4, 8, 14, 6, 10, 12, // + 6, 8, 14, 0, 2, 4, 10, 12, /**/ 0, 6, 8, 14, 2, 4, 10, 12, // + 2, 6, 8, 14, 0, 4, 10, 12, /**/ 0, 2, 6, 8, 14, 4, 10, 12, // + 4, 6, 8, 14, 0, 2, 10, 12, /**/ 0, 4, 6, 8, 14, 2, 10, 12, // + 2, 4, 6, 8, 14, 0, 10, 12, /**/ 0, 2, 4, 6, 8, 14, 10, 12, // + 10, 14, 0, 2, 4, 6, 8, 12, /**/ 0, 10, 14, 2, 4, 6, 8, 12, // + 2, 10, 14, 0, 4, 6, 8, 12, /**/ 0, 2, 10, 14, 4, 6, 8, 12, // + 4, 10, 14, 0, 2, 6, 8, 12, /**/ 0, 4, 10, 14, 2, 6, 8, 12, // + 2, 4, 10, 14, 0, 6, 8, 12, /**/ 0, 2, 4, 10, 14, 6, 8, 12, // + 6, 10, 14, 0, 2, 4, 8, 12, /**/ 0, 6, 10, 14, 2, 4, 8, 12, // + 2, 6, 10, 14, 0, 4, 8, 12, /**/ 0, 2, 6, 10, 14, 4, 8, 12, // + 4, 6, 10, 14, 0, 2, 8, 12, /**/ 0, 4, 6, 10, 14, 2, 8, 12, // + 2, 4, 6, 10, 14, 0, 8, 12, /**/ 0, 2, 4, 6, 10, 14, 8, 12, // + 8, 10, 14, 0, 2, 4, 6, 12, /**/ 0, 8, 10, 14, 2, 4, 6, 12, // + 2, 8, 10, 14, 0, 4, 6, 12, /**/ 0, 2, 8, 10, 14, 4, 6, 12, // + 4, 8, 10, 14, 0, 2, 6, 12, /**/ 0, 4, 8, 10, 14, 2, 6, 12, // + 2, 4, 8, 10, 14, 0, 6, 12, /**/ 0, 2, 4, 8, 10, 14, 6, 12, // + 6, 8, 10, 14, 0, 2, 4, 12, /**/ 0, 6, 8, 10, 14, 2, 4, 12, // + 2, 6, 8, 10, 14, 0, 4, 12, /**/ 0, 2, 6, 8, 10, 14, 4, 12, // + 4, 6, 8, 10, 14, 0, 2, 12, /**/ 0, 4, 6, 8, 10, 14, 2, 12, // + 2, 4, 6, 8, 10, 14, 0, 12, /**/ 0, 2, 4, 6, 8, 10, 14, 12, // + 12, 14, 0, 2, 4, 6, 8, 10, /**/ 0, 12, 14, 2, 4, 6, 8, 10, // + 2, 12, 14, 0, 4, 6, 8, 10, /**/ 0, 2, 12, 14, 4, 6, 8, 10, // + 4, 12, 14, 0, 2, 6, 8, 10, /**/ 0, 4, 12, 14, 2, 6, 8, 10, // + 2, 4, 12, 14, 0, 6, 8, 10, /**/ 0, 2, 4, 12, 14, 6, 8, 10, // + 6, 12, 14, 0, 2, 4, 8, 10, /**/ 0, 6, 12, 14, 2, 4, 8, 10, // + 2, 6, 12, 14, 0, 4, 8, 10, /**/ 0, 2, 6, 12, 14, 4, 8, 10, // + 4, 6, 12, 14, 0, 2, 8, 10, /**/ 0, 4, 6, 12, 14, 2, 8, 10, // + 2, 4, 6, 12, 14, 0, 8, 10, /**/ 0, 2, 4, 6, 12, 14, 8, 10, // + 8, 12, 14, 0, 2, 4, 6, 10, /**/ 0, 8, 12, 14, 2, 4, 6, 10, // + 2, 8, 12, 14, 0, 4, 6, 10, /**/ 0, 2, 8, 12, 14, 4, 6, 10, // + 4, 8, 12, 14, 0, 2, 6, 10, /**/ 0, 4, 8, 12, 14, 2, 6, 10, // + 2, 4, 8, 12, 14, 0, 6, 10, /**/ 0, 2, 4, 8, 12, 14, 6, 10, // + 6, 8, 12, 14, 0, 2, 4, 10, /**/ 0, 6, 8, 12, 14, 2, 4, 10, // + 2, 6, 8, 12, 14, 0, 4, 10, /**/ 0, 2, 6, 8, 12, 14, 4, 
10, // + 4, 6, 8, 12, 14, 0, 2, 10, /**/ 0, 4, 6, 8, 12, 14, 2, 10, // + 2, 4, 6, 8, 12, 14, 0, 10, /**/ 0, 2, 4, 6, 8, 12, 14, 10, // + 10, 12, 14, 0, 2, 4, 6, 8, /**/ 0, 10, 12, 14, 2, 4, 6, 8, // + 2, 10, 12, 14, 0, 4, 6, 8, /**/ 0, 2, 10, 12, 14, 4, 6, 8, // + 4, 10, 12, 14, 0, 2, 6, 8, /**/ 0, 4, 10, 12, 14, 2, 6, 8, // + 2, 4, 10, 12, 14, 0, 6, 8, /**/ 0, 2, 4, 10, 12, 14, 6, 8, // + 6, 10, 12, 14, 0, 2, 4, 8, /**/ 0, 6, 10, 12, 14, 2, 4, 8, // + 2, 6, 10, 12, 14, 0, 4, 8, /**/ 0, 2, 6, 10, 12, 14, 4, 8, // + 4, 6, 10, 12, 14, 0, 2, 8, /**/ 0, 4, 6, 10, 12, 14, 2, 8, // + 2, 4, 6, 10, 12, 14, 0, 8, /**/ 0, 2, 4, 6, 10, 12, 14, 8, // + 8, 10, 12, 14, 0, 2, 4, 6, /**/ 0, 8, 10, 12, 14, 2, 4, 6, // + 2, 8, 10, 12, 14, 0, 4, 6, /**/ 0, 2, 8, 10, 12, 14, 4, 6, // + 4, 8, 10, 12, 14, 0, 2, 6, /**/ 0, 4, 8, 10, 12, 14, 2, 6, // + 2, 4, 8, 10, 12, 14, 0, 6, /**/ 0, 2, 4, 8, 10, 12, 14, 6, // + 6, 8, 10, 12, 14, 0, 2, 4, /**/ 0, 6, 8, 10, 12, 14, 2, 4, // + 2, 6, 8, 10, 12, 14, 0, 4, /**/ 0, 2, 6, 8, 10, 12, 14, 4, // + 4, 6, 8, 10, 12, 14, 0, 2, /**/ 0, 4, 6, 8, 10, 12, 14, 2, // + 2, 4, 6, 8, 10, 12, 14, 0, /**/ 0, 2, 4, 6, 8, 10, 12, 14}; + + const Vec128 byte_idx{Load(d8, table + mask_bits * 8).raw}; + const Vec128 pairs = ZipLower(byte_idx, byte_idx); + return BitCast(d, pairs + Set(du, 0x0100)); +} + +template +HWY_INLINE Vec128 IndicesFromNotBits(Simd d, + uint64_t mask_bits) { + HWY_DASSERT(mask_bits < 256); + const Rebind d8; + const Simd du; + + // compress_epi16 requires VBMI2 and there is no permutevar_epi16, so we need + // byte indices for PSHUFB (one vector's worth for each of 256 combinations of + // 8 mask bits). Loading them directly would require 4 KiB. We can instead + // store lane indices and convert to byte indices (2*lane + 0..1), with the + // doubling baked into the table. AVX2 Compress32 stores eight 4-bit lane + // indices (total 1 KiB), broadcasts them into each 32-bit lane and shifts. + // Here, 16-bit lanes are too narrow to hold all bits, and unpacking nibbles + // is likely more costly than the higher cache footprint from storing bytes. 
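+  // Same byte-pair encoding as IndicesFromBits above, but these rows (from
+  // PrintCompressNot16x8Tables) order the mask=false lanes first.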
+ alignas(16) constexpr uint8_t table[2048] = { + // PrintCompressNot16x8Tables + 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 8, 10, 12, 14, 0, // + 0, 4, 6, 8, 10, 12, 14, 2, /**/ 4, 6, 8, 10, 12, 14, 0, 2, // + 0, 2, 6, 8, 10, 12, 14, 4, /**/ 2, 6, 8, 10, 12, 14, 0, 4, // + 0, 6, 8, 10, 12, 14, 2, 4, /**/ 6, 8, 10, 12, 14, 0, 2, 4, // + 0, 2, 4, 8, 10, 12, 14, 6, /**/ 2, 4, 8, 10, 12, 14, 0, 6, // + 0, 4, 8, 10, 12, 14, 2, 6, /**/ 4, 8, 10, 12, 14, 0, 2, 6, // + 0, 2, 8, 10, 12, 14, 4, 6, /**/ 2, 8, 10, 12, 14, 0, 4, 6, // + 0, 8, 10, 12, 14, 2, 4, 6, /**/ 8, 10, 12, 14, 0, 2, 4, 6, // + 0, 2, 4, 6, 10, 12, 14, 8, /**/ 2, 4, 6, 10, 12, 14, 0, 8, // + 0, 4, 6, 10, 12, 14, 2, 8, /**/ 4, 6, 10, 12, 14, 0, 2, 8, // + 0, 2, 6, 10, 12, 14, 4, 8, /**/ 2, 6, 10, 12, 14, 0, 4, 8, // + 0, 6, 10, 12, 14, 2, 4, 8, /**/ 6, 10, 12, 14, 0, 2, 4, 8, // + 0, 2, 4, 10, 12, 14, 6, 8, /**/ 2, 4, 10, 12, 14, 0, 6, 8, // + 0, 4, 10, 12, 14, 2, 6, 8, /**/ 4, 10, 12, 14, 0, 2, 6, 8, // + 0, 2, 10, 12, 14, 4, 6, 8, /**/ 2, 10, 12, 14, 0, 4, 6, 8, // + 0, 10, 12, 14, 2, 4, 6, 8, /**/ 10, 12, 14, 0, 2, 4, 6, 8, // + 0, 2, 4, 6, 8, 12, 14, 10, /**/ 2, 4, 6, 8, 12, 14, 0, 10, // + 0, 4, 6, 8, 12, 14, 2, 10, /**/ 4, 6, 8, 12, 14, 0, 2, 10, // + 0, 2, 6, 8, 12, 14, 4, 10, /**/ 2, 6, 8, 12, 14, 0, 4, 10, // + 0, 6, 8, 12, 14, 2, 4, 10, /**/ 6, 8, 12, 14, 0, 2, 4, 10, // + 0, 2, 4, 8, 12, 14, 6, 10, /**/ 2, 4, 8, 12, 14, 0, 6, 10, // + 0, 4, 8, 12, 14, 2, 6, 10, /**/ 4, 8, 12, 14, 0, 2, 6, 10, // + 0, 2, 8, 12, 14, 4, 6, 10, /**/ 2, 8, 12, 14, 0, 4, 6, 10, // + 0, 8, 12, 14, 2, 4, 6, 10, /**/ 8, 12, 14, 0, 2, 4, 6, 10, // + 0, 2, 4, 6, 12, 14, 8, 10, /**/ 2, 4, 6, 12, 14, 0, 8, 10, // + 0, 4, 6, 12, 14, 2, 8, 10, /**/ 4, 6, 12, 14, 0, 2, 8, 10, // + 0, 2, 6, 12, 14, 4, 8, 10, /**/ 2, 6, 12, 14, 0, 4, 8, 10, // + 0, 6, 12, 14, 2, 4, 8, 10, /**/ 6, 12, 14, 0, 2, 4, 8, 10, // + 0, 2, 4, 12, 14, 6, 8, 10, /**/ 2, 4, 12, 14, 0, 6, 8, 10, // + 0, 4, 12, 14, 2, 6, 8, 10, /**/ 4, 12, 14, 0, 2, 6, 8, 10, // + 0, 2, 12, 14, 4, 6, 8, 10, /**/ 2, 12, 14, 0, 4, 6, 8, 10, // + 0, 12, 14, 2, 4, 6, 8, 10, /**/ 12, 14, 0, 2, 4, 6, 8, 10, // + 0, 2, 4, 6, 8, 10, 14, 12, /**/ 2, 4, 6, 8, 10, 14, 0, 12, // + 0, 4, 6, 8, 10, 14, 2, 12, /**/ 4, 6, 8, 10, 14, 0, 2, 12, // + 0, 2, 6, 8, 10, 14, 4, 12, /**/ 2, 6, 8, 10, 14, 0, 4, 12, // + 0, 6, 8, 10, 14, 2, 4, 12, /**/ 6, 8, 10, 14, 0, 2, 4, 12, // + 0, 2, 4, 8, 10, 14, 6, 12, /**/ 2, 4, 8, 10, 14, 0, 6, 12, // + 0, 4, 8, 10, 14, 2, 6, 12, /**/ 4, 8, 10, 14, 0, 2, 6, 12, // + 0, 2, 8, 10, 14, 4, 6, 12, /**/ 2, 8, 10, 14, 0, 4, 6, 12, // + 0, 8, 10, 14, 2, 4, 6, 12, /**/ 8, 10, 14, 0, 2, 4, 6, 12, // + 0, 2, 4, 6, 10, 14, 8, 12, /**/ 2, 4, 6, 10, 14, 0, 8, 12, // + 0, 4, 6, 10, 14, 2, 8, 12, /**/ 4, 6, 10, 14, 0, 2, 8, 12, // + 0, 2, 6, 10, 14, 4, 8, 12, /**/ 2, 6, 10, 14, 0, 4, 8, 12, // + 0, 6, 10, 14, 2, 4, 8, 12, /**/ 6, 10, 14, 0, 2, 4, 8, 12, // + 0, 2, 4, 10, 14, 6, 8, 12, /**/ 2, 4, 10, 14, 0, 6, 8, 12, // + 0, 4, 10, 14, 2, 6, 8, 12, /**/ 4, 10, 14, 0, 2, 6, 8, 12, // + 0, 2, 10, 14, 4, 6, 8, 12, /**/ 2, 10, 14, 0, 4, 6, 8, 12, // + 0, 10, 14, 2, 4, 6, 8, 12, /**/ 10, 14, 0, 2, 4, 6, 8, 12, // + 0, 2, 4, 6, 8, 14, 10, 12, /**/ 2, 4, 6, 8, 14, 0, 10, 12, // + 0, 4, 6, 8, 14, 2, 10, 12, /**/ 4, 6, 8, 14, 0, 2, 10, 12, // + 0, 2, 6, 8, 14, 4, 10, 12, /**/ 2, 6, 8, 14, 0, 4, 10, 12, // + 0, 6, 8, 14, 2, 4, 10, 12, /**/ 6, 8, 14, 0, 2, 4, 10, 12, // + 0, 2, 4, 8, 14, 6, 10, 12, /**/ 2, 4, 8, 14, 0, 6, 10, 12, // + 0, 4, 8, 14, 2, 6, 10, 12, /**/ 4, 8, 14, 0, 2, 6, 10, 12, // + 0, 2, 8, 14, 4, 
6, 10, 12, /**/ 2, 8, 14, 0, 4, 6, 10, 12, // + 0, 8, 14, 2, 4, 6, 10, 12, /**/ 8, 14, 0, 2, 4, 6, 10, 12, // + 0, 2, 4, 6, 14, 8, 10, 12, /**/ 2, 4, 6, 14, 0, 8, 10, 12, // + 0, 4, 6, 14, 2, 8, 10, 12, /**/ 4, 6, 14, 0, 2, 8, 10, 12, // + 0, 2, 6, 14, 4, 8, 10, 12, /**/ 2, 6, 14, 0, 4, 8, 10, 12, // + 0, 6, 14, 2, 4, 8, 10, 12, /**/ 6, 14, 0, 2, 4, 8, 10, 12, // + 0, 2, 4, 14, 6, 8, 10, 12, /**/ 2, 4, 14, 0, 6, 8, 10, 12, // + 0, 4, 14, 2, 6, 8, 10, 12, /**/ 4, 14, 0, 2, 6, 8, 10, 12, // + 0, 2, 14, 4, 6, 8, 10, 12, /**/ 2, 14, 0, 4, 6, 8, 10, 12, // + 0, 14, 2, 4, 6, 8, 10, 12, /**/ 14, 0, 2, 4, 6, 8, 10, 12, // + 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 8, 10, 12, 0, 14, // + 0, 4, 6, 8, 10, 12, 2, 14, /**/ 4, 6, 8, 10, 12, 0, 2, 14, // + 0, 2, 6, 8, 10, 12, 4, 14, /**/ 2, 6, 8, 10, 12, 0, 4, 14, // + 0, 6, 8, 10, 12, 2, 4, 14, /**/ 6, 8, 10, 12, 0, 2, 4, 14, // + 0, 2, 4, 8, 10, 12, 6, 14, /**/ 2, 4, 8, 10, 12, 0, 6, 14, // + 0, 4, 8, 10, 12, 2, 6, 14, /**/ 4, 8, 10, 12, 0, 2, 6, 14, // + 0, 2, 8, 10, 12, 4, 6, 14, /**/ 2, 8, 10, 12, 0, 4, 6, 14, // + 0, 8, 10, 12, 2, 4, 6, 14, /**/ 8, 10, 12, 0, 2, 4, 6, 14, // + 0, 2, 4, 6, 10, 12, 8, 14, /**/ 2, 4, 6, 10, 12, 0, 8, 14, // + 0, 4, 6, 10, 12, 2, 8, 14, /**/ 4, 6, 10, 12, 0, 2, 8, 14, // + 0, 2, 6, 10, 12, 4, 8, 14, /**/ 2, 6, 10, 12, 0, 4, 8, 14, // + 0, 6, 10, 12, 2, 4, 8, 14, /**/ 6, 10, 12, 0, 2, 4, 8, 14, // + 0, 2, 4, 10, 12, 6, 8, 14, /**/ 2, 4, 10, 12, 0, 6, 8, 14, // + 0, 4, 10, 12, 2, 6, 8, 14, /**/ 4, 10, 12, 0, 2, 6, 8, 14, // + 0, 2, 10, 12, 4, 6, 8, 14, /**/ 2, 10, 12, 0, 4, 6, 8, 14, // + 0, 10, 12, 2, 4, 6, 8, 14, /**/ 10, 12, 0, 2, 4, 6, 8, 14, // + 0, 2, 4, 6, 8, 12, 10, 14, /**/ 2, 4, 6, 8, 12, 0, 10, 14, // + 0, 4, 6, 8, 12, 2, 10, 14, /**/ 4, 6, 8, 12, 0, 2, 10, 14, // + 0, 2, 6, 8, 12, 4, 10, 14, /**/ 2, 6, 8, 12, 0, 4, 10, 14, // + 0, 6, 8, 12, 2, 4, 10, 14, /**/ 6, 8, 12, 0, 2, 4, 10, 14, // + 0, 2, 4, 8, 12, 6, 10, 14, /**/ 2, 4, 8, 12, 0, 6, 10, 14, // + 0, 4, 8, 12, 2, 6, 10, 14, /**/ 4, 8, 12, 0, 2, 6, 10, 14, // + 0, 2, 8, 12, 4, 6, 10, 14, /**/ 2, 8, 12, 0, 4, 6, 10, 14, // + 0, 8, 12, 2, 4, 6, 10, 14, /**/ 8, 12, 0, 2, 4, 6, 10, 14, // + 0, 2, 4, 6, 12, 8, 10, 14, /**/ 2, 4, 6, 12, 0, 8, 10, 14, // + 0, 4, 6, 12, 2, 8, 10, 14, /**/ 4, 6, 12, 0, 2, 8, 10, 14, // + 0, 2, 6, 12, 4, 8, 10, 14, /**/ 2, 6, 12, 0, 4, 8, 10, 14, // + 0, 6, 12, 2, 4, 8, 10, 14, /**/ 6, 12, 0, 2, 4, 8, 10, 14, // + 0, 2, 4, 12, 6, 8, 10, 14, /**/ 2, 4, 12, 0, 6, 8, 10, 14, // + 0, 4, 12, 2, 6, 8, 10, 14, /**/ 4, 12, 0, 2, 6, 8, 10, 14, // + 0, 2, 12, 4, 6, 8, 10, 14, /**/ 2, 12, 0, 4, 6, 8, 10, 14, // + 0, 12, 2, 4, 6, 8, 10, 14, /**/ 12, 0, 2, 4, 6, 8, 10, 14, // + 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 8, 10, 0, 12, 14, // + 0, 4, 6, 8, 10, 2, 12, 14, /**/ 4, 6, 8, 10, 0, 2, 12, 14, // + 0, 2, 6, 8, 10, 4, 12, 14, /**/ 2, 6, 8, 10, 0, 4, 12, 14, // + 0, 6, 8, 10, 2, 4, 12, 14, /**/ 6, 8, 10, 0, 2, 4, 12, 14, // + 0, 2, 4, 8, 10, 6, 12, 14, /**/ 2, 4, 8, 10, 0, 6, 12, 14, // + 0, 4, 8, 10, 2, 6, 12, 14, /**/ 4, 8, 10, 0, 2, 6, 12, 14, // + 0, 2, 8, 10, 4, 6, 12, 14, /**/ 2, 8, 10, 0, 4, 6, 12, 14, // + 0, 8, 10, 2, 4, 6, 12, 14, /**/ 8, 10, 0, 2, 4, 6, 12, 14, // + 0, 2, 4, 6, 10, 8, 12, 14, /**/ 2, 4, 6, 10, 0, 8, 12, 14, // + 0, 4, 6, 10, 2, 8, 12, 14, /**/ 4, 6, 10, 0, 2, 8, 12, 14, // + 0, 2, 6, 10, 4, 8, 12, 14, /**/ 2, 6, 10, 0, 4, 8, 12, 14, // + 0, 6, 10, 2, 4, 8, 12, 14, /**/ 6, 10, 0, 2, 4, 8, 12, 14, // + 0, 2, 4, 10, 6, 8, 12, 14, /**/ 2, 4, 10, 0, 6, 8, 12, 14, // + 0, 4, 10, 2, 6, 8, 12, 14, /**/ 4, 10, 0, 2, 6, 8, 
12, 14, // + 0, 2, 10, 4, 6, 8, 12, 14, /**/ 2, 10, 0, 4, 6, 8, 12, 14, // + 0, 10, 2, 4, 6, 8, 12, 14, /**/ 10, 0, 2, 4, 6, 8, 12, 14, // + 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 8, 0, 10, 12, 14, // + 0, 4, 6, 8, 2, 10, 12, 14, /**/ 4, 6, 8, 0, 2, 10, 12, 14, // + 0, 2, 6, 8, 4, 10, 12, 14, /**/ 2, 6, 8, 0, 4, 10, 12, 14, // + 0, 6, 8, 2, 4, 10, 12, 14, /**/ 6, 8, 0, 2, 4, 10, 12, 14, // + 0, 2, 4, 8, 6, 10, 12, 14, /**/ 2, 4, 8, 0, 6, 10, 12, 14, // + 0, 4, 8, 2, 6, 10, 12, 14, /**/ 4, 8, 0, 2, 6, 10, 12, 14, // + 0, 2, 8, 4, 6, 10, 12, 14, /**/ 2, 8, 0, 4, 6, 10, 12, 14, // + 0, 8, 2, 4, 6, 10, 12, 14, /**/ 8, 0, 2, 4, 6, 10, 12, 14, // + 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 0, 8, 10, 12, 14, // + 0, 4, 6, 2, 8, 10, 12, 14, /**/ 4, 6, 0, 2, 8, 10, 12, 14, // + 0, 2, 6, 4, 8, 10, 12, 14, /**/ 2, 6, 0, 4, 8, 10, 12, 14, // + 0, 6, 2, 4, 8, 10, 12, 14, /**/ 6, 0, 2, 4, 8, 10, 12, 14, // + 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 0, 6, 8, 10, 12, 14, // + 0, 4, 2, 6, 8, 10, 12, 14, /**/ 4, 0, 2, 6, 8, 10, 12, 14, // + 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 0, 4, 6, 8, 10, 12, 14, // + 0, 2, 4, 6, 8, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14}; + + const Vec128 byte_idx{Load(d8, table + mask_bits * 8).raw}; + const Vec128 pairs = ZipLower(byte_idx, byte_idx); + return BitCast(d, pairs + Set(du, 0x0100)); +} + +template +HWY_INLINE Vec128 IndicesFromBits(Simd d, uint64_t mask_bits) { + HWY_DASSERT(mask_bits < 16); + + // There are only 4 lanes, so we can afford to load the index vector directly. + alignas(16) constexpr uint8_t u8_indices[256] = { + // PrintCompress32x4Tables + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, // + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, // + 4, 5, 6, 7, 0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, // + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, // + 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, // + 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, // + 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, // + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, // + 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, // + 0, 1, 2, 3, 12, 13, 14, 15, 4, 5, 6, 7, 8, 9, 10, 11, // + 4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 8, 9, 10, 11, // + 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 8, 9, 10, 11, // + 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, // + 0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, // + 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, // + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; + + const Repartition d8; + return BitCast(d, Load(d8, u8_indices + 16 * mask_bits)); +} + +template +HWY_INLINE Vec128 IndicesFromNotBits(Simd d, + uint64_t mask_bits) { + HWY_DASSERT(mask_bits < 16); + + // There are only 4 lanes, so we can afford to load the index vector directly. 
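+  // 16 rows of 16 byte indices, one per mask; e.g. mask_bits = 1 (only lane 0
+  // true) loads {4..15, 0..3}, moving the three false lanes ahead of lane 0.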
+ alignas(16) constexpr uint8_t u8_indices[256] = { + // PrintCompressNot32x4Tables + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, + 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3, + 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, + 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, + 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, 0, 1, + 2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, + 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, 0, 1, + 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, 8, 9, 10, 11, + 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, + 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, 0, 1, 2, 3, + 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, + 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, + 12, 13, 14, 15}; + + const Repartition d8; + return BitCast(d, Load(d8, u8_indices + 16 * mask_bits)); +} + +template +HWY_INLINE Vec128 IndicesFromBits(Simd d, uint64_t mask_bits) { + HWY_DASSERT(mask_bits < 4); + + // There are only 2 lanes, so we can afford to load the index vector directly. + alignas(16) constexpr uint8_t u8_indices[64] = { + // PrintCompress64x2Tables + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; + + const Repartition d8; + return BitCast(d, Load(d8, u8_indices + 16 * mask_bits)); +} + +template +HWY_INLINE Vec128 IndicesFromNotBits(Simd d, + uint64_t mask_bits) { + HWY_DASSERT(mask_bits < 4); + + // There are only 2 lanes, so we can afford to load the index vector directly. + alignas(16) constexpr uint8_t u8_indices[64] = { + // PrintCompressNot64x2Tables + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; + + const Repartition d8; + return BitCast(d, Load(d8, u8_indices + 16 * mask_bits)); +} + +template +HWY_API Vec128 CompressBits(Vec128 v, uint64_t mask_bits) { + const Simd d; + const RebindToUnsigned du; + + HWY_DASSERT(mask_bits < (1ull << N)); + const auto indices = BitCast(du, detail::IndicesFromBits(d, mask_bits)); + return BitCast(d, TableLookupBytes(BitCast(du, v), indices)); +} + +template +HWY_API Vec128 CompressNotBits(Vec128 v, uint64_t mask_bits) { + const Simd d; + const RebindToUnsigned du; + + HWY_DASSERT(mask_bits < (1ull << N)); + const auto indices = BitCast(du, detail::IndicesFromNotBits(d, mask_bits)); + return BitCast(d, TableLookupBytes(BitCast(du, v), indices)); +} + +} // namespace detail + +// Single lane: no-op +template +HWY_API Vec128 Compress(Vec128 v, Mask128 /*m*/) { + return v; +} + +// Two lanes: conditional swap +template +HWY_API Vec128 Compress(Vec128 v, Mask128 mask) { + // If mask[1] = 1 and mask[0] = 0, then swap both halves, else keep. 
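+  // DupEven/DupOdd broadcast mask[0] / mask[1] to both lanes, so
+  // swap = maskH & ~maskL is all-ones exactly in that case.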
+ const Full128 d; + const Vec128 m = VecFromMask(d, mask); + const Vec128 maskL = DupEven(m); + const Vec128 maskH = DupOdd(m); + const Vec128 swap = AndNot(maskL, maskH); + return IfVecThenElse(swap, Shuffle01(v), v); +} + +// General case +template +HWY_API Vec128 Compress(Vec128 v, Mask128 mask) { + return detail::CompressBits(v, detail::BitsFromMask(mask)); +} + +// Single lane: no-op +template +HWY_API Vec128 CompressNot(Vec128 v, Mask128 /*m*/) { + return v; +} + +// Two lanes: conditional swap +template +HWY_API Vec128 CompressNot(Vec128 v, Mask128 mask) { + // If mask[1] = 0 and mask[0] = 1, then swap both halves, else keep. + const Full128 d; + const Vec128 m = VecFromMask(d, mask); + const Vec128 maskL = DupEven(m); + const Vec128 maskH = DupOdd(m); + const Vec128 swap = AndNot(maskH, maskL); + return IfVecThenElse(swap, Shuffle01(v), v); +} + +// General case +template +HWY_API Vec128 CompressNot(Vec128 v, Mask128 mask) { + // For partial vectors, we cannot pull the Not() into the table because + // BitsFromMask clears the upper bits. + if (N < 16 / sizeof(T)) { + return detail::CompressBits(v, detail::BitsFromMask(Not(mask))); + } + return detail::CompressNotBits(v, detail::BitsFromMask(mask)); +} + +// ------------------------------ CompressBlocksNot +HWY_API Vec128 CompressBlocksNot(Vec128 v, + Mask128 /* m */) { + return v; +} + +template +HWY_API Vec128 CompressBits(Vec128 v, + const uint8_t* HWY_RESTRICT bits) { + uint64_t mask_bits = 0; + constexpr size_t kNumBytes = (N + 7) / 8; + CopyBytes(bits, &mask_bits); + if (N < 8) { + mask_bits &= (1ull << N) - 1; + } + + return detail::CompressBits(v, mask_bits); +} + +// ------------------------------ CompressStore, CompressBitsStore + +template +HWY_API size_t CompressStore(Vec128 v, Mask128 m, Simd d, + T* HWY_RESTRICT unaligned) { + const RebindToUnsigned du; + + const uint64_t mask_bits = detail::BitsFromMask(m); + HWY_DASSERT(mask_bits < (1ull << N)); + const size_t count = PopCount(mask_bits); + + // Avoid _mm_maskmoveu_si128 (>500 cycle latency because it bypasses caches). + const auto indices = BitCast(du, detail::IndicesFromBits(d, mask_bits)); + const auto compressed = BitCast(d, TableLookupBytes(BitCast(du, v), indices)); + StoreU(compressed, d, unaligned); + // Workaround for MSAN not marking output as initialized (b/233326619) +#if HWY_IS_MSAN + __msan_unpoison(unaligned, count * sizeof(T)); +#endif + + return count; +} + +template +HWY_API size_t CompressBlendedStore(Vec128 v, Mask128 m, + Simd d, + T* HWY_RESTRICT unaligned) { + const RebindToUnsigned du; + + const uint64_t mask_bits = detail::BitsFromMask(m); + HWY_DASSERT(mask_bits < (1ull << N)); + const size_t count = PopCount(mask_bits); + + // Avoid _mm_maskmoveu_si128 (>500 cycle latency because it bypasses caches). 
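+  // Net effect: only the first `count` compressed lanes are written; memory
+  // past them keeps its previous contents (hence "blended").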
+ const auto indices = BitCast(du, detail::IndicesFromBits(d, mask_bits)); + const auto compressed = BitCast(d, TableLookupBytes(BitCast(du, v), indices)); + BlendedStore(compressed, FirstN(d, count), d, unaligned); + // Workaround for MSAN not marking output as initialized (b/233326619) +#if HWY_IS_MSAN + __msan_unpoison(unaligned, count * sizeof(T)); +#endif + return count; +} + +template +HWY_API size_t CompressBitsStore(Vec128 v, + const uint8_t* HWY_RESTRICT bits, + Simd d, T* HWY_RESTRICT unaligned) { + const RebindToUnsigned du; + + uint64_t mask_bits = 0; + constexpr size_t kNumBytes = (N + 7) / 8; + CopyBytes(bits, &mask_bits); + if (N < 8) { + mask_bits &= (1ull << N) - 1; + } + const size_t count = PopCount(mask_bits); + + // Avoid _mm_maskmoveu_si128 (>500 cycle latency because it bypasses caches). + const auto indices = BitCast(du, detail::IndicesFromBits(d, mask_bits)); + const auto compressed = BitCast(d, TableLookupBytes(BitCast(du, v), indices)); + StoreU(compressed, d, unaligned); + + // Workaround for MSAN not marking output as initialized (b/233326619) +#if HWY_IS_MSAN + __msan_unpoison(unaligned, count * sizeof(T)); +#endif + return count; +} + +#endif // HWY_TARGET <= HWY_AVX3 + +// ------------------------------ StoreInterleaved2/3/4 + +// HWY_NATIVE_LOAD_STORE_INTERLEAVED not set, hence defined in +// generic_ops-inl.h. + +// ------------------------------ Reductions + +namespace detail { + +// N=1 for any T: no-op +template +HWY_INLINE Vec128 SumOfLanes(hwy::SizeTag /* tag */, + const Vec128 v) { + return v; +} +template +HWY_INLINE Vec128 MinOfLanes(hwy::SizeTag /* tag */, + const Vec128 v) { + return v; +} +template +HWY_INLINE Vec128 MaxOfLanes(hwy::SizeTag /* tag */, + const Vec128 v) { + return v; +} + +// u32/i32/f32: + +// N=2 +template +HWY_INLINE Vec128 SumOfLanes(hwy::SizeTag<4> /* tag */, + const Vec128 v10) { + return v10 + Shuffle2301(v10); +} +template +HWY_INLINE Vec128 MinOfLanes(hwy::SizeTag<4> /* tag */, + const Vec128 v10) { + return Min(v10, Shuffle2301(v10)); +} +template +HWY_INLINE Vec128 MaxOfLanes(hwy::SizeTag<4> /* tag */, + const Vec128 v10) { + return Max(v10, Shuffle2301(v10)); +} + +// N=4 (full) +template +HWY_INLINE Vec128 SumOfLanes(hwy::SizeTag<4> /* tag */, + const Vec128 v3210) { + const Vec128 v1032 = Shuffle1032(v3210); + const Vec128 v31_20_31_20 = v3210 + v1032; + const Vec128 v20_31_20_31 = Shuffle0321(v31_20_31_20); + return v20_31_20_31 + v31_20_31_20; +} +template +HWY_INLINE Vec128 MinOfLanes(hwy::SizeTag<4> /* tag */, + const Vec128 v3210) { + const Vec128 v1032 = Shuffle1032(v3210); + const Vec128 v31_20_31_20 = Min(v3210, v1032); + const Vec128 v20_31_20_31 = Shuffle0321(v31_20_31_20); + return Min(v20_31_20_31, v31_20_31_20); +} +template +HWY_INLINE Vec128 MaxOfLanes(hwy::SizeTag<4> /* tag */, + const Vec128 v3210) { + const Vec128 v1032 = Shuffle1032(v3210); + const Vec128 v31_20_31_20 = Max(v3210, v1032); + const Vec128 v20_31_20_31 = Shuffle0321(v31_20_31_20); + return Max(v20_31_20_31, v31_20_31_20); +} + +// u64/i64/f64: + +// N=2 (full) +template +HWY_INLINE Vec128 SumOfLanes(hwy::SizeTag<8> /* tag */, + const Vec128 v10) { + const Vec128 v01 = Shuffle01(v10); + return v10 + v01; +} +template +HWY_INLINE Vec128 MinOfLanes(hwy::SizeTag<8> /* tag */, + const Vec128 v10) { + const Vec128 v01 = Shuffle01(v10); + return Min(v10, v01); +} +template +HWY_INLINE Vec128 MaxOfLanes(hwy::SizeTag<8> /* tag */, + const Vec128 v10) { + const Vec128 v01 = Shuffle01(v10); + return Max(v10, v01); +} + +template +HWY_API Vec128 
SumOfLanes(hwy::SizeTag<2> /* tag */, + Vec128 v) { + const Simd d; + const RepartitionToWide d32; + const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF)); + const auto odd = ShiftRight<16>(BitCast(d32, v)); + const auto sum = SumOfLanes(hwy::SizeTag<4>(), even + odd); + // Also broadcast into odd lanes. + return OddEven(BitCast(d, ShiftLeft<16>(sum)), BitCast(d, sum)); +} +template +HWY_API Vec128 SumOfLanes(hwy::SizeTag<2> /* tag */, + Vec128 v) { + const Simd d; + const RepartitionToWide d32; + // Sign-extend + const auto even = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, v))); + const auto odd = ShiftRight<16>(BitCast(d32, v)); + const auto sum = SumOfLanes(hwy::SizeTag<4>(), even + odd); + // Also broadcast into odd lanes. + return OddEven(BitCast(d, ShiftLeft<16>(sum)), BitCast(d, sum)); +} + +template +HWY_API Vec128 MinOfLanes(hwy::SizeTag<2> /* tag */, + Vec128 v) { + const Simd d; + const RepartitionToWide d32; + const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF)); + const auto odd = ShiftRight<16>(BitCast(d32, v)); + const auto min = MinOfLanes(hwy::SizeTag<4>(), Min(even, odd)); + // Also broadcast into odd lanes. + return OddEven(BitCast(d, ShiftLeft<16>(min)), BitCast(d, min)); +} +template +HWY_API Vec128 MinOfLanes(hwy::SizeTag<2> /* tag */, + Vec128 v) { + const Simd d; + const RepartitionToWide d32; + // Sign-extend + const auto even = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, v))); + const auto odd = ShiftRight<16>(BitCast(d32, v)); + const auto min = MinOfLanes(hwy::SizeTag<4>(), Min(even, odd)); + // Also broadcast into odd lanes. + return OddEven(BitCast(d, ShiftLeft<16>(min)), BitCast(d, min)); +} + +template +HWY_API Vec128 MaxOfLanes(hwy::SizeTag<2> /* tag */, + Vec128 v) { + const Simd d; + const RepartitionToWide d32; + const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF)); + const auto odd = ShiftRight<16>(BitCast(d32, v)); + const auto min = MaxOfLanes(hwy::SizeTag<4>(), Max(even, odd)); + // Also broadcast into odd lanes. + return OddEven(BitCast(d, ShiftLeft<16>(min)), BitCast(d, min)); +} +template +HWY_API Vec128 MaxOfLanes(hwy::SizeTag<2> /* tag */, + Vec128 v) { + const Simd d; + const RepartitionToWide d32; + // Sign-extend + const auto even = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, v))); + const auto odd = ShiftRight<16>(BitCast(d32, v)); + const auto min = MaxOfLanes(hwy::SizeTag<4>(), Max(even, odd)); + // Also broadcast into odd lanes. + return OddEven(BitCast(d, ShiftLeft<16>(min)), BitCast(d, min)); +} + +} // namespace detail + +// Supported for u/i/f 32/64. Returns the same value in each lane. +template +HWY_API Vec128 SumOfLanes(Simd /* tag */, const Vec128 v) { + return detail::SumOfLanes(hwy::SizeTag(), v); +} +template +HWY_API Vec128 MinOfLanes(Simd /* tag */, const Vec128 v) { + return detail::MinOfLanes(hwy::SizeTag(), v); +} +template +HWY_API Vec128 MaxOfLanes(Simd /* tag */, const Vec128 v) { + return detail::MaxOfLanes(hwy::SizeTag(), v); +} + +// ------------------------------ Lt128 + +namespace detail { + +// Returns vector-mask for Lt128. Also used by x86_256/x86_512. +template > +HWY_INLINE V Lt128Vec(const D d, const V a, const V b) { + static_assert(!IsSigned>() && sizeof(TFromD) == 8, + "D must be u64"); + // Truth table of Eq and Lt for Hi and Lo u64. 
+ // (removed lines with (=H && cH) or (=L && cL) - cannot both be true) + // =H =L cH cL | out = cH | (=H & cL) + // 0 0 0 0 | 0 + // 0 0 0 1 | 0 + // 0 0 1 0 | 1 + // 0 0 1 1 | 1 + // 0 1 0 0 | 0 + // 0 1 0 1 | 0 + // 0 1 1 0 | 1 + // 1 0 0 0 | 0 + // 1 0 0 1 | 1 + // 1 1 0 0 | 0 + const auto eqHL = Eq(a, b); + const V ltHL = VecFromMask(d, Lt(a, b)); + const V ltLX = ShiftLeftLanes<1>(ltHL); + const V vecHx = IfThenElse(eqHL, ltLX, ltHL); + return InterleaveUpper(d, vecHx, vecHx); +} + +// Returns vector-mask for Eq128. Also used by x86_256/x86_512. +template > +HWY_INLINE V Eq128Vec(const D d, const V a, const V b) { + static_assert(!IsSigned>() && sizeof(TFromD) == 8, + "D must be u64"); + const auto eqHL = VecFromMask(d, Eq(a, b)); + const auto eqLH = Reverse2(d, eqHL); + return And(eqHL, eqLH); +} + +template > +HWY_INLINE V Ne128Vec(const D d, const V a, const V b) { + static_assert(!IsSigned>() && sizeof(TFromD) == 8, + "D must be u64"); + const auto neHL = VecFromMask(d, Ne(a, b)); + const auto neLH = Reverse2(d, neHL); + return Or(neHL, neLH); +} + +template > +HWY_INLINE V Lt128UpperVec(const D d, const V a, const V b) { + // No specialization required for AVX-512: Mask <-> Vec is fast, and + // copying mask bits to their neighbor seems infeasible. + const V ltHL = VecFromMask(d, Lt(a, b)); + return InterleaveUpper(d, ltHL, ltHL); +} + +template > +HWY_INLINE V Eq128UpperVec(const D d, const V a, const V b) { + // No specialization required for AVX-512: Mask <-> Vec is fast, and + // copying mask bits to their neighbor seems infeasible. + const V eqHL = VecFromMask(d, Eq(a, b)); + return InterleaveUpper(d, eqHL, eqHL); +} + +template > +HWY_INLINE V Ne128UpperVec(const D d, const V a, const V b) { + // No specialization required for AVX-512: Mask <-> Vec is fast, and + // copying mask bits to their neighbor seems infeasible. + const V neHL = VecFromMask(d, Ne(a, b)); + return InterleaveUpper(d, neHL, neHL); +} + +} // namespace detail + +template > +HWY_API MFromD Lt128(D d, const V a, const V b) { + return MaskFromVec(detail::Lt128Vec(d, a, b)); +} + +template > +HWY_API MFromD Eq128(D d, const V a, const V b) { + return MaskFromVec(detail::Eq128Vec(d, a, b)); +} + +template > +HWY_API MFromD Ne128(D d, const V a, const V b) { + return MaskFromVec(detail::Ne128Vec(d, a, b)); +} + +template > +HWY_API MFromD Lt128Upper(D d, const V a, const V b) { + return MaskFromVec(detail::Lt128UpperVec(d, a, b)); +} + +template > +HWY_API MFromD Eq128Upper(D d, const V a, const V b) { + return MaskFromVec(detail::Eq128UpperVec(d, a, b)); +} + +template > +HWY_API MFromD Ne128Upper(D d, const V a, const V b) { + return MaskFromVec(detail::Ne128UpperVec(d, a, b)); +} + +// ------------------------------ Min128, Max128 (Lt128) + +// Avoids the extra MaskFromVec in Lt128. 
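+// Illustrative scalar model (hypothetical helper, not part of the library):
+// per 128-bit block with the high u64 in the odd lane, Lt128 above computes
+// the standard two-limb comparison, i.e. out = cH | (=H & cL):
+inline bool Lt128ScalarRef(uint64_t aH, uint64_t aL, uint64_t bH,
+                           uint64_t bL) {
+  return (aH < bH) || (aH == bH && aL < bL);
+}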
+template > +HWY_API V Min128(D d, const V a, const V b) { + return IfVecThenElse(detail::Lt128Vec(d, a, b), a, b); +} + +template > +HWY_API V Max128(D d, const V a, const V b) { + return IfVecThenElse(detail::Lt128Vec(d, b, a), a, b); +} + +template > +HWY_API V Min128Upper(D d, const V a, const V b) { + return IfVecThenElse(detail::Lt128UpperVec(d, a, b), a, b); +} + +template > +HWY_API V Max128Upper(D d, const V a, const V b) { + return IfVecThenElse(detail::Lt128UpperVec(d, b, a), a, b); +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +// Note that the GCC warnings are not suppressed if we only wrap the *intrin.h - +// the warning seems to be issued at the call site of intrinsics, i.e. our code. +HWY_DIAGNOSTICS(pop) diff --git a/hwy/ops/x86_256-inl.h b/hwy/ops/x86_256-inl.h new file mode 100644 index 0000000..12a83cb --- /dev/null +++ b/hwy/ops/x86_256-inl.h @@ -0,0 +1,5619 @@ +// Copyright 2019 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// 256-bit vectors and AVX2 instructions, plus some AVX512-VL operations when +// compiling for that target. +// External include guard in highway.h - see comment there. + +// WARNING: most operations do not cross 128-bit block boundaries. In +// particular, "Broadcast", pack and zip behavior may be surprising. + +// Must come before HWY_DIAGNOSTICS and HWY_COMPILER_CLANGCL +#include "hwy/base.h" + +// Avoid uninitialized warnings in GCC's avx512fintrin.h - see +// https://github.com/google/highway/issues/710) +HWY_DIAGNOSTICS(push) +#if HWY_COMPILER_GCC_ACTUAL +HWY_DIAGNOSTICS_OFF(disable : 4701, ignored "-Wuninitialized") +HWY_DIAGNOSTICS_OFF(disable : 4703 6001 26494, ignored "-Wmaybe-uninitialized") +#endif + +// Must come before HWY_COMPILER_CLANGCL +#include // AVX2+ + +#if HWY_COMPILER_CLANGCL +// Including should be enough, but Clang's headers helpfully skip +// including these headers when _MSC_VER is defined, like when using clang-cl. +// Include these directly here. +#include +// avxintrin defines __m256i and must come before avx2intrin. +#include +#include // _pext_u64 +#include +#include +#include +#endif // HWY_COMPILER_CLANGCL + +#include +#include +#include // memcpy + +#if HWY_IS_MSAN +#include +#endif + +// For half-width vectors. Already includes base.h and shared-inl.h. +#include "hwy/ops/x86_128-inl.h" + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { +namespace detail { + +template +struct Raw256 { + using type = __m256i; +}; +template <> +struct Raw256 { + using type = __m256; +}; +template <> +struct Raw256 { + using type = __m256d; +}; + +} // namespace detail + +template +class Vec256 { + using Raw = typename detail::Raw256::type; + + public: + // Compound assignment. Only usable if there is a corresponding non-member + // binary operator overload. For example, only f32 and f64 support division. 
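+  // Usage sketch (editorial): given Vec256<float> x, y,
+  //   x += y;  // same as x = x + y
+  //   x /= y;  // compiles only for f32/f64, which define operator/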
+ HWY_INLINE Vec256& operator*=(const Vec256 other) { + return *this = (*this * other); + } + HWY_INLINE Vec256& operator/=(const Vec256 other) { + return *this = (*this / other); + } + HWY_INLINE Vec256& operator+=(const Vec256 other) { + return *this = (*this + other); + } + HWY_INLINE Vec256& operator-=(const Vec256 other) { + return *this = (*this - other); + } + HWY_INLINE Vec256& operator&=(const Vec256 other) { + return *this = (*this & other); + } + HWY_INLINE Vec256& operator|=(const Vec256 other) { + return *this = (*this | other); + } + HWY_INLINE Vec256& operator^=(const Vec256 other) { + return *this = (*this ^ other); + } + + Raw raw; +}; + +#if HWY_TARGET <= HWY_AVX3 + +namespace detail { + +// Template arg: sizeof(lane type) +template +struct RawMask256 {}; +template <> +struct RawMask256<1> { + using type = __mmask32; +}; +template <> +struct RawMask256<2> { + using type = __mmask16; +}; +template <> +struct RawMask256<4> { + using type = __mmask8; +}; +template <> +struct RawMask256<8> { + using type = __mmask8; +}; + +} // namespace detail + +template +struct Mask256 { + using Raw = typename detail::RawMask256::type; + + static Mask256 FromBits(uint64_t mask_bits) { + return Mask256{static_cast(mask_bits)}; + } + + Raw raw; +}; + +#else // AVX2 + +// FF..FF or 0. +template +struct Mask256 { + typename detail::Raw256::type raw; +}; + +#endif // HWY_TARGET <= HWY_AVX3 + +// ------------------------------ BitCast + +namespace detail { + +HWY_INLINE __m256i BitCastToInteger(__m256i v) { return v; } +HWY_INLINE __m256i BitCastToInteger(__m256 v) { return _mm256_castps_si256(v); } +HWY_INLINE __m256i BitCastToInteger(__m256d v) { + return _mm256_castpd_si256(v); +} + +template +HWY_INLINE Vec256 BitCastToByte(Vec256 v) { + return Vec256{BitCastToInteger(v.raw)}; +} + +// Cannot rely on function overloading because return types differ. +template +struct BitCastFromInteger256 { + HWY_INLINE __m256i operator()(__m256i v) { return v; } +}; +template <> +struct BitCastFromInteger256 { + HWY_INLINE __m256 operator()(__m256i v) { return _mm256_castsi256_ps(v); } +}; +template <> +struct BitCastFromInteger256 { + HWY_INLINE __m256d operator()(__m256i v) { return _mm256_castsi256_pd(v); } +}; + +template +HWY_INLINE Vec256 BitCastFromByte(Full256 /* tag */, Vec256 v) { + return Vec256{BitCastFromInteger256()(v.raw)}; +} + +} // namespace detail + +template +HWY_API Vec256 BitCast(Full256 d, Vec256 v) { + return detail::BitCastFromByte(d, detail::BitCastToByte(v)); +} + +// ------------------------------ Set + +// Returns an all-zero vector. +template +HWY_API Vec256 Zero(Full256 /* tag */) { + return Vec256{_mm256_setzero_si256()}; +} +HWY_API Vec256 Zero(Full256 /* tag */) { + return Vec256{_mm256_setzero_ps()}; +} +HWY_API Vec256 Zero(Full256 /* tag */) { + return Vec256{_mm256_setzero_pd()}; +} + +// Returns a vector with all lanes set to "t". 
+HWY_API Vec256<uint8_t> Set(Full256<uint8_t> /* tag */, const uint8_t t) {
+  return Vec256<uint8_t>{_mm256_set1_epi8(static_cast<char>(t))};  // NOLINT
+}
+HWY_API Vec256<uint16_t> Set(Full256<uint16_t> /* tag */, const uint16_t t) {
+  return Vec256<uint16_t>{_mm256_set1_epi16(static_cast<short>(t))};  // NOLINT
+}
+HWY_API Vec256<uint32_t> Set(Full256<uint32_t> /* tag */, const uint32_t t) {
+  return Vec256<uint32_t>{_mm256_set1_epi32(static_cast<int>(t))};
+}
+HWY_API Vec256<uint64_t> Set(Full256<uint64_t> /* tag */, const uint64_t t) {
+  return Vec256<uint64_t>{
+      _mm256_set1_epi64x(static_cast<long long>(t))};  // NOLINT
+}
+HWY_API Vec256<int8_t> Set(Full256<int8_t> /* tag */, const int8_t t) {
+  return Vec256<int8_t>{_mm256_set1_epi8(static_cast<char>(t))};  // NOLINT
+}
+HWY_API Vec256<int16_t> Set(Full256<int16_t> /* tag */, const int16_t t) {
+  return Vec256<int16_t>{_mm256_set1_epi16(static_cast<short>(t))};  // NOLINT
+}
+HWY_API Vec256<int32_t> Set(Full256<int32_t> /* tag */, const int32_t t) {
+  return Vec256<int32_t>{_mm256_set1_epi32(t)};
+}
+HWY_API Vec256<int64_t> Set(Full256<int64_t> /* tag */, const int64_t t) {
+  return Vec256<int64_t>{
+      _mm256_set1_epi64x(static_cast<long long>(t))};  // NOLINT
+}
+HWY_API Vec256<float> Set(Full256<float> /* tag */, const float t) {
+  return Vec256<float>{_mm256_set1_ps(t)};
+}
+HWY_API Vec256<double> Set(Full256<double> /* tag */, const double t) {
+  return Vec256<double>{_mm256_set1_pd(t)};
+}
+
+HWY_DIAGNOSTICS(push)
+HWY_DIAGNOSTICS_OFF(disable : 4700, ignored "-Wuninitialized")
+
+// Returns a vector with uninitialized elements.
+template <typename T>
+HWY_API Vec256<T> Undefined(Full256<T> /* tag */) {
+  // Available on Clang 6.0, GCC 6.2, ICC 16.03, MSVC 19.14. All but ICC
+  // generate an XOR instruction.
+  return Vec256<T>{_mm256_undefined_si256()};
+}
+HWY_API Vec256<float> Undefined(Full256<float> /* tag */) {
+  return Vec256<float>{_mm256_undefined_ps()};
+}
+HWY_API Vec256<double> Undefined(Full256<double> /* tag */) {
+  return Vec256<double>{_mm256_undefined_pd()};
+}
+
+HWY_DIAGNOSTICS(pop)
+
+// ================================================== LOGICAL
+
+// ------------------------------ And
+
+template <typename T>
+HWY_API Vec256<T> And(Vec256<T> a, Vec256<T> b) {
+  return Vec256<T>{_mm256_and_si256(a.raw, b.raw)};
+}
+
+HWY_API Vec256<float> And(const Vec256<float> a, const Vec256<float> b) {
+  return Vec256<float>{_mm256_and_ps(a.raw, b.raw)};
+}
+HWY_API Vec256<double> And(const Vec256<double> a, const Vec256<double> b) {
+  return Vec256<double>{_mm256_and_pd(a.raw, b.raw)};
+}
+
+// ------------------------------ AndNot
+
+// Returns ~not_mask & mask.
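+// Editorial note: the *first* operand is the inverted one, matching
+// _mm256_andnot_si256: AndNot(a, b) == ~a & b. For example, clearing the
+// sign bits of v is AndNot(SignBit(d), v).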
+template +HWY_API Vec256 AndNot(Vec256 not_mask, Vec256 mask) { + return Vec256{_mm256_andnot_si256(not_mask.raw, mask.raw)}; +} +HWY_API Vec256 AndNot(const Vec256 not_mask, + const Vec256 mask) { + return Vec256{_mm256_andnot_ps(not_mask.raw, mask.raw)}; +} +HWY_API Vec256 AndNot(const Vec256 not_mask, + const Vec256 mask) { + return Vec256{_mm256_andnot_pd(not_mask.raw, mask.raw)}; +} + +// ------------------------------ Or + +template +HWY_API Vec256 Or(Vec256 a, Vec256 b) { + return Vec256{_mm256_or_si256(a.raw, b.raw)}; +} + +HWY_API Vec256 Or(const Vec256 a, const Vec256 b) { + return Vec256{_mm256_or_ps(a.raw, b.raw)}; +} +HWY_API Vec256 Or(const Vec256 a, const Vec256 b) { + return Vec256{_mm256_or_pd(a.raw, b.raw)}; +} + +// ------------------------------ Xor + +template +HWY_API Vec256 Xor(Vec256 a, Vec256 b) { + return Vec256{_mm256_xor_si256(a.raw, b.raw)}; +} + +HWY_API Vec256 Xor(const Vec256 a, const Vec256 b) { + return Vec256{_mm256_xor_ps(a.raw, b.raw)}; +} +HWY_API Vec256 Xor(const Vec256 a, const Vec256 b) { + return Vec256{_mm256_xor_pd(a.raw, b.raw)}; +} + +// ------------------------------ Not + +template +HWY_API Vec256 Not(const Vec256 v) { + using TU = MakeUnsigned; +#if HWY_TARGET <= HWY_AVX3 + const __m256i vu = BitCast(Full256(), v).raw; + return BitCast(Full256(), + Vec256{_mm256_ternarylogic_epi32(vu, vu, vu, 0x55)}); +#else + return Xor(v, BitCast(Full256(), Vec256{_mm256_set1_epi32(-1)})); +#endif +} + +// ------------------------------ Or3 + +template +HWY_API Vec256 Or3(Vec256 o1, Vec256 o2, Vec256 o3) { +#if HWY_TARGET <= HWY_AVX3 + const Full256 d; + const RebindToUnsigned du; + using VU = VFromD; + const __m256i ret = _mm256_ternarylogic_epi64( + BitCast(du, o1).raw, BitCast(du, o2).raw, BitCast(du, o3).raw, 0xFE); + return BitCast(d, VU{ret}); +#else + return Or(o1, Or(o2, o3)); +#endif +} + +// ------------------------------ OrAnd + +template +HWY_API Vec256 OrAnd(Vec256 o, Vec256 a1, Vec256 a2) { +#if HWY_TARGET <= HWY_AVX3 + const Full256 d; + const RebindToUnsigned du; + using VU = VFromD; + const __m256i ret = _mm256_ternarylogic_epi64( + BitCast(du, o).raw, BitCast(du, a1).raw, BitCast(du, a2).raw, 0xF8); + return BitCast(d, VU{ret}); +#else + return Or(o, And(a1, a2)); +#endif +} + +// ------------------------------ IfVecThenElse + +template +HWY_API Vec256 IfVecThenElse(Vec256 mask, Vec256 yes, Vec256 no) { +#if HWY_TARGET <= HWY_AVX3 + const Full256 d; + const RebindToUnsigned du; + using VU = VFromD; + return BitCast(d, VU{_mm256_ternarylogic_epi64(BitCast(du, mask).raw, + BitCast(du, yes).raw, + BitCast(du, no).raw, 0xCA)}); +#else + return IfThenElse(MaskFromVec(mask), yes, no); +#endif +} + +// ------------------------------ Operator overloads (internal-only if float) + +template +HWY_API Vec256 operator&(const Vec256 a, const Vec256 b) { + return And(a, b); +} + +template +HWY_API Vec256 operator|(const Vec256 a, const Vec256 b) { + return Or(a, b); +} + +template +HWY_API Vec256 operator^(const Vec256 a, const Vec256 b) { + return Xor(a, b); +} + +// ------------------------------ PopulationCount + +// 8/16 require BITALG, 32/64 require VPOPCNTDQ. 
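+// Editorial example: PopulationCount is per lane, e.g. a u8 lane holding
+// 0xB1 = 0b10110001 yields 4. Without AVX3_DL these intrinsics are not
+// available and the library's generic fallback is used instead.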
+#if HWY_TARGET == HWY_AVX3_DL + +#ifdef HWY_NATIVE_POPCNT +#undef HWY_NATIVE_POPCNT +#else +#define HWY_NATIVE_POPCNT +#endif + +namespace detail { + +template +HWY_INLINE Vec256 PopulationCount(hwy::SizeTag<1> /* tag */, Vec256 v) { + return Vec256{_mm256_popcnt_epi8(v.raw)}; +} +template +HWY_INLINE Vec256 PopulationCount(hwy::SizeTag<2> /* tag */, Vec256 v) { + return Vec256{_mm256_popcnt_epi16(v.raw)}; +} +template +HWY_INLINE Vec256 PopulationCount(hwy::SizeTag<4> /* tag */, Vec256 v) { + return Vec256{_mm256_popcnt_epi32(v.raw)}; +} +template +HWY_INLINE Vec256 PopulationCount(hwy::SizeTag<8> /* tag */, Vec256 v) { + return Vec256{_mm256_popcnt_epi64(v.raw)}; +} + +} // namespace detail + +template +HWY_API Vec256 PopulationCount(Vec256 v) { + return detail::PopulationCount(hwy::SizeTag(), v); +} + +#endif // HWY_TARGET == HWY_AVX3_DL + +// ================================================== SIGN + +// ------------------------------ CopySign + +template +HWY_API Vec256 CopySign(const Vec256 magn, const Vec256 sign) { + static_assert(IsFloat(), "Only makes sense for floating-point"); + + const Full256 d; + const auto msb = SignBit(d); + +#if HWY_TARGET <= HWY_AVX3 + const Rebind, decltype(d)> du; + // Truth table for msb, magn, sign | bitwise msb ? sign : mag + // 0 0 0 | 0 + // 0 0 1 | 0 + // 0 1 0 | 1 + // 0 1 1 | 1 + // 1 0 0 | 0 + // 1 0 1 | 1 + // 1 1 0 | 0 + // 1 1 1 | 1 + // The lane size does not matter because we are not using predication. + const __m256i out = _mm256_ternarylogic_epi32( + BitCast(du, msb).raw, BitCast(du, magn).raw, BitCast(du, sign).raw, 0xAC); + return BitCast(d, decltype(Zero(du)){out}); +#else + return Or(AndNot(msb, magn), And(msb, sign)); +#endif +} + +template +HWY_API Vec256 CopySignToAbs(const Vec256 abs, const Vec256 sign) { +#if HWY_TARGET <= HWY_AVX3 + // AVX3 can also handle abs < 0, so no extra action needed. + return CopySign(abs, sign); +#else + return Or(abs, And(SignBit(Full256()), sign)); +#endif +} + +// ================================================== MASK + +#if HWY_TARGET <= HWY_AVX3 + +// ------------------------------ IfThenElse + +// Returns mask ? b : a. + +namespace detail { + +// Templates for signed/unsigned integer of a particular size. 
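+// Editorial note: _mm256_mask_mov_epi* is direct predication: for each lane
+// i, dst[i] = (mask bit i) ? yes[i] : no[i], with the AVX-512 __mmask
+// holding one bit per lane. The SizeTag dispatch below merely selects the
+// intrinsic matching the lane width.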
+template +HWY_INLINE Vec256 IfThenElse(hwy::SizeTag<1> /* tag */, Mask256 mask, + Vec256 yes, Vec256 no) { + return Vec256{_mm256_mask_mov_epi8(no.raw, mask.raw, yes.raw)}; +} +template +HWY_INLINE Vec256 IfThenElse(hwy::SizeTag<2> /* tag */, Mask256 mask, + Vec256 yes, Vec256 no) { + return Vec256{_mm256_mask_mov_epi16(no.raw, mask.raw, yes.raw)}; +} +template +HWY_INLINE Vec256 IfThenElse(hwy::SizeTag<4> /* tag */, Mask256 mask, + Vec256 yes, Vec256 no) { + return Vec256{_mm256_mask_mov_epi32(no.raw, mask.raw, yes.raw)}; +} +template +HWY_INLINE Vec256 IfThenElse(hwy::SizeTag<8> /* tag */, Mask256 mask, + Vec256 yes, Vec256 no) { + return Vec256{_mm256_mask_mov_epi64(no.raw, mask.raw, yes.raw)}; +} + +} // namespace detail + +template +HWY_API Vec256 IfThenElse(Mask256 mask, Vec256 yes, Vec256 no) { + return detail::IfThenElse(hwy::SizeTag(), mask, yes, no); +} +HWY_API Vec256 IfThenElse(Mask256 mask, Vec256 yes, + Vec256 no) { + return Vec256{_mm256_mask_mov_ps(no.raw, mask.raw, yes.raw)}; +} +HWY_API Vec256 IfThenElse(Mask256 mask, Vec256 yes, + Vec256 no) { + return Vec256{_mm256_mask_mov_pd(no.raw, mask.raw, yes.raw)}; +} + +namespace detail { + +template +HWY_INLINE Vec256 IfThenElseZero(hwy::SizeTag<1> /* tag */, Mask256 mask, + Vec256 yes) { + return Vec256{_mm256_maskz_mov_epi8(mask.raw, yes.raw)}; +} +template +HWY_INLINE Vec256 IfThenElseZero(hwy::SizeTag<2> /* tag */, Mask256 mask, + Vec256 yes) { + return Vec256{_mm256_maskz_mov_epi16(mask.raw, yes.raw)}; +} +template +HWY_INLINE Vec256 IfThenElseZero(hwy::SizeTag<4> /* tag */, Mask256 mask, + Vec256 yes) { + return Vec256{_mm256_maskz_mov_epi32(mask.raw, yes.raw)}; +} +template +HWY_INLINE Vec256 IfThenElseZero(hwy::SizeTag<8> /* tag */, Mask256 mask, + Vec256 yes) { + return Vec256{_mm256_maskz_mov_epi64(mask.raw, yes.raw)}; +} + +} // namespace detail + +template +HWY_API Vec256 IfThenElseZero(Mask256 mask, Vec256 yes) { + return detail::IfThenElseZero(hwy::SizeTag(), mask, yes); +} +HWY_API Vec256 IfThenElseZero(Mask256 mask, Vec256 yes) { + return Vec256{_mm256_maskz_mov_ps(mask.raw, yes.raw)}; +} +HWY_API Vec256 IfThenElseZero(Mask256 mask, + Vec256 yes) { + return Vec256{_mm256_maskz_mov_pd(mask.raw, yes.raw)}; +} + +namespace detail { + +template +HWY_INLINE Vec256 IfThenZeroElse(hwy::SizeTag<1> /* tag */, Mask256 mask, + Vec256 no) { + // xor_epi8/16 are missing, but we have sub, which is just as fast for u8/16. 
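+  // Editorial clarification: in lanes whose mask bit is set, the masked sub
+  // computes no - no = 0; elsewhere it passes `no` through unchanged, which
+  // is exactly mask ? 0 : no.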
+ return Vec256{_mm256_mask_sub_epi8(no.raw, mask.raw, no.raw, no.raw)}; +} +template +HWY_INLINE Vec256 IfThenZeroElse(hwy::SizeTag<2> /* tag */, Mask256 mask, + Vec256 no) { + return Vec256{_mm256_mask_sub_epi16(no.raw, mask.raw, no.raw, no.raw)}; +} +template +HWY_INLINE Vec256 IfThenZeroElse(hwy::SizeTag<4> /* tag */, Mask256 mask, + Vec256 no) { + return Vec256{_mm256_mask_xor_epi32(no.raw, mask.raw, no.raw, no.raw)}; +} +template +HWY_INLINE Vec256 IfThenZeroElse(hwy::SizeTag<8> /* tag */, Mask256 mask, + Vec256 no) { + return Vec256{_mm256_mask_xor_epi64(no.raw, mask.raw, no.raw, no.raw)}; +} + +} // namespace detail + +template +HWY_API Vec256 IfThenZeroElse(Mask256 mask, Vec256 no) { + return detail::IfThenZeroElse(hwy::SizeTag(), mask, no); +} +HWY_API Vec256 IfThenZeroElse(Mask256 mask, Vec256 no) { + return Vec256{_mm256_mask_xor_ps(no.raw, mask.raw, no.raw, no.raw)}; +} +HWY_API Vec256 IfThenZeroElse(Mask256 mask, Vec256 no) { + return Vec256{_mm256_mask_xor_pd(no.raw, mask.raw, no.raw, no.raw)}; +} + +template +HWY_API Vec256 ZeroIfNegative(const Vec256 v) { + static_assert(IsSigned(), "Only for float"); + // AVX3 MaskFromVec only looks at the MSB + return IfThenZeroElse(MaskFromVec(v), v); +} + +// ------------------------------ Mask logical + +namespace detail { + +template +HWY_INLINE Mask256 And(hwy::SizeTag<1> /*tag*/, const Mask256 a, + const Mask256 b) { +#if HWY_COMPILER_HAS_MASK_INTRINSICS + return Mask256{_kand_mask32(a.raw, b.raw)}; +#else + return Mask256{static_cast<__mmask32>(a.raw & b.raw)}; +#endif +} +template +HWY_INLINE Mask256 And(hwy::SizeTag<2> /*tag*/, const Mask256 a, + const Mask256 b) { +#if HWY_COMPILER_HAS_MASK_INTRINSICS + return Mask256{_kand_mask16(a.raw, b.raw)}; +#else + return Mask256{static_cast<__mmask16>(a.raw & b.raw)}; +#endif +} +template +HWY_INLINE Mask256 And(hwy::SizeTag<4> /*tag*/, const Mask256 a, + const Mask256 b) { +#if HWY_COMPILER_HAS_MASK_INTRINSICS + return Mask256{_kand_mask8(a.raw, b.raw)}; +#else + return Mask256{static_cast<__mmask8>(a.raw & b.raw)}; +#endif +} +template +HWY_INLINE Mask256 And(hwy::SizeTag<8> /*tag*/, const Mask256 a, + const Mask256 b) { +#if HWY_COMPILER_HAS_MASK_INTRINSICS + return Mask256{_kand_mask8(a.raw, b.raw)}; +#else + return Mask256{static_cast<__mmask8>(a.raw & b.raw)}; +#endif +} + +template +HWY_INLINE Mask256 AndNot(hwy::SizeTag<1> /*tag*/, const Mask256 a, + const Mask256 b) { +#if HWY_COMPILER_HAS_MASK_INTRINSICS + return Mask256{_kandn_mask32(a.raw, b.raw)}; +#else + return Mask256{static_cast<__mmask32>(~a.raw & b.raw)}; +#endif +} +template +HWY_INLINE Mask256 AndNot(hwy::SizeTag<2> /*tag*/, const Mask256 a, + const Mask256 b) { +#if HWY_COMPILER_HAS_MASK_INTRINSICS + return Mask256{_kandn_mask16(a.raw, b.raw)}; +#else + return Mask256{static_cast<__mmask16>(~a.raw & b.raw)}; +#endif +} +template +HWY_INLINE Mask256 AndNot(hwy::SizeTag<4> /*tag*/, const Mask256 a, + const Mask256 b) { +#if HWY_COMPILER_HAS_MASK_INTRINSICS + return Mask256{_kandn_mask8(a.raw, b.raw)}; +#else + return Mask256{static_cast<__mmask8>(~a.raw & b.raw)}; +#endif +} +template +HWY_INLINE Mask256 AndNot(hwy::SizeTag<8> /*tag*/, const Mask256 a, + const Mask256 b) { +#if HWY_COMPILER_HAS_MASK_INTRINSICS + return Mask256{_kandn_mask8(a.raw, b.raw)}; +#else + return Mask256{static_cast<__mmask8>(~a.raw & b.raw)}; +#endif +} + +template +HWY_INLINE Mask256 Or(hwy::SizeTag<1> /*tag*/, const Mask256 a, + const Mask256 b) { +#if HWY_COMPILER_HAS_MASK_INTRINSICS + return Mask256{_kor_mask32(a.raw, b.raw)}; +#else + 
return Mask256{static_cast<__mmask32>(a.raw | b.raw)}; +#endif +} +template +HWY_INLINE Mask256 Or(hwy::SizeTag<2> /*tag*/, const Mask256 a, + const Mask256 b) { +#if HWY_COMPILER_HAS_MASK_INTRINSICS + return Mask256{_kor_mask16(a.raw, b.raw)}; +#else + return Mask256{static_cast<__mmask16>(a.raw | b.raw)}; +#endif +} +template +HWY_INLINE Mask256 Or(hwy::SizeTag<4> /*tag*/, const Mask256 a, + const Mask256 b) { +#if HWY_COMPILER_HAS_MASK_INTRINSICS + return Mask256{_kor_mask8(a.raw, b.raw)}; +#else + return Mask256{static_cast<__mmask8>(a.raw | b.raw)}; +#endif +} +template +HWY_INLINE Mask256 Or(hwy::SizeTag<8> /*tag*/, const Mask256 a, + const Mask256 b) { +#if HWY_COMPILER_HAS_MASK_INTRINSICS + return Mask256{_kor_mask8(a.raw, b.raw)}; +#else + return Mask256{static_cast<__mmask8>(a.raw | b.raw)}; +#endif +} + +template +HWY_INLINE Mask256 Xor(hwy::SizeTag<1> /*tag*/, const Mask256 a, + const Mask256 b) { +#if HWY_COMPILER_HAS_MASK_INTRINSICS + return Mask256{_kxor_mask32(a.raw, b.raw)}; +#else + return Mask256{static_cast<__mmask32>(a.raw ^ b.raw)}; +#endif +} +template +HWY_INLINE Mask256 Xor(hwy::SizeTag<2> /*tag*/, const Mask256 a, + const Mask256 b) { +#if HWY_COMPILER_HAS_MASK_INTRINSICS + return Mask256{_kxor_mask16(a.raw, b.raw)}; +#else + return Mask256{static_cast<__mmask16>(a.raw ^ b.raw)}; +#endif +} +template +HWY_INLINE Mask256 Xor(hwy::SizeTag<4> /*tag*/, const Mask256 a, + const Mask256 b) { +#if HWY_COMPILER_HAS_MASK_INTRINSICS + return Mask256{_kxor_mask8(a.raw, b.raw)}; +#else + return Mask256{static_cast<__mmask8>(a.raw ^ b.raw)}; +#endif +} +template +HWY_INLINE Mask256 Xor(hwy::SizeTag<8> /*tag*/, const Mask256 a, + const Mask256 b) { +#if HWY_COMPILER_HAS_MASK_INTRINSICS + return Mask256{_kxor_mask8(a.raw, b.raw)}; +#else + return Mask256{static_cast<__mmask8>(a.raw ^ b.raw)}; +#endif +} + +template +HWY_INLINE Mask256 ExclusiveNeither(hwy::SizeTag<1> /*tag*/, + const Mask256 a, const Mask256 b) { +#if HWY_COMPILER_HAS_MASK_INTRINSICS + return Mask256{_kxnor_mask32(a.raw, b.raw)}; +#else + return Mask256{static_cast<__mmask32>(~(a.raw ^ b.raw) & 0xFFFFFFFF)}; +#endif +} +template +HWY_INLINE Mask256 ExclusiveNeither(hwy::SizeTag<2> /*tag*/, + const Mask256 a, const Mask256 b) { +#if HWY_COMPILER_HAS_MASK_INTRINSICS + return Mask256{_kxnor_mask16(a.raw, b.raw)}; +#else + return Mask256{static_cast<__mmask16>(~(a.raw ^ b.raw) & 0xFFFF)}; +#endif +} +template +HWY_INLINE Mask256 ExclusiveNeither(hwy::SizeTag<4> /*tag*/, + const Mask256 a, const Mask256 b) { +#if HWY_COMPILER_HAS_MASK_INTRINSICS + return Mask256{_kxnor_mask8(a.raw, b.raw)}; +#else + return Mask256{static_cast<__mmask8>(~(a.raw ^ b.raw) & 0xFF)}; +#endif +} +template +HWY_INLINE Mask256 ExclusiveNeither(hwy::SizeTag<8> /*tag*/, + const Mask256 a, const Mask256 b) { +#if HWY_COMPILER_HAS_MASK_INTRINSICS + return Mask256{static_cast<__mmask8>(_kxnor_mask8(a.raw, b.raw) & 0xF)}; +#else + return Mask256{static_cast<__mmask8>(~(a.raw ^ b.raw) & 0xF)}; +#endif +} + +} // namespace detail + +template +HWY_API Mask256 And(const Mask256 a, Mask256 b) { + return detail::And(hwy::SizeTag(), a, b); +} + +template +HWY_API Mask256 AndNot(const Mask256 a, Mask256 b) { + return detail::AndNot(hwy::SizeTag(), a, b); +} + +template +HWY_API Mask256 Or(const Mask256 a, Mask256 b) { + return detail::Or(hwy::SizeTag(), a, b); +} + +template +HWY_API Mask256 Xor(const Mask256 a, Mask256 b) { + return detail::Xor(hwy::SizeTag(), a, b); +} + +template +HWY_API Mask256 Not(const Mask256 m) { + // Flip only the valid bits. 
+ constexpr size_t N = 32 / sizeof(T); + return Xor(m, Mask256::FromBits((1ull << N) - 1)); +} + +template +HWY_API Mask256 ExclusiveNeither(const Mask256 a, Mask256 b) { + return detail::ExclusiveNeither(hwy::SizeTag(), a, b); +} + +#else // AVX2 + +// ------------------------------ Mask + +// Mask and Vec are the same (true = FF..FF). +template +HWY_API Mask256 MaskFromVec(const Vec256 v) { + return Mask256{v.raw}; +} + +template +HWY_API Vec256 VecFromMask(const Mask256 v) { + return Vec256{v.raw}; +} + +template +HWY_API Vec256 VecFromMask(Full256 /* tag */, const Mask256 v) { + return Vec256{v.raw}; +} + +// ------------------------------ IfThenElse + +// mask ? yes : no +template +HWY_API Vec256 IfThenElse(const Mask256 mask, const Vec256 yes, + const Vec256 no) { + return Vec256{_mm256_blendv_epi8(no.raw, yes.raw, mask.raw)}; +} +HWY_API Vec256 IfThenElse(const Mask256 mask, + const Vec256 yes, + const Vec256 no) { + return Vec256{_mm256_blendv_ps(no.raw, yes.raw, mask.raw)}; +} +HWY_API Vec256 IfThenElse(const Mask256 mask, + const Vec256 yes, + const Vec256 no) { + return Vec256{_mm256_blendv_pd(no.raw, yes.raw, mask.raw)}; +} + +// mask ? yes : 0 +template +HWY_API Vec256 IfThenElseZero(Mask256 mask, Vec256 yes) { + return yes & VecFromMask(Full256(), mask); +} + +// mask ? 0 : no +template +HWY_API Vec256 IfThenZeroElse(Mask256 mask, Vec256 no) { + return AndNot(VecFromMask(Full256(), mask), no); +} + +template +HWY_API Vec256 ZeroIfNegative(Vec256 v) { + static_assert(IsSigned(), "Only for float"); + const auto zero = Zero(Full256()); + // AVX2 IfThenElse only looks at the MSB for 32/64-bit lanes + return IfThenElse(MaskFromVec(v), zero, v); +} + +// ------------------------------ Mask logical + +template +HWY_API Mask256 Not(const Mask256 m) { + return MaskFromVec(Not(VecFromMask(Full256(), m))); +} + +template +HWY_API Mask256 And(const Mask256 a, Mask256 b) { + const Full256 d; + return MaskFromVec(And(VecFromMask(d, a), VecFromMask(d, b))); +} + +template +HWY_API Mask256 AndNot(const Mask256 a, Mask256 b) { + const Full256 d; + return MaskFromVec(AndNot(VecFromMask(d, a), VecFromMask(d, b))); +} + +template +HWY_API Mask256 Or(const Mask256 a, Mask256 b) { + const Full256 d; + return MaskFromVec(Or(VecFromMask(d, a), VecFromMask(d, b))); +} + +template +HWY_API Mask256 Xor(const Mask256 a, Mask256 b) { + const Full256 d; + return MaskFromVec(Xor(VecFromMask(d, a), VecFromMask(d, b))); +} + +template +HWY_API Mask256 ExclusiveNeither(const Mask256 a, Mask256 b) { + const Full256 d; + return MaskFromVec(AndNot(VecFromMask(d, a), Not(VecFromMask(d, b)))); +} + +#endif // HWY_TARGET <= HWY_AVX3 + +// ================================================== COMPARE + +#if HWY_TARGET <= HWY_AVX3 + +// Comparisons set a mask bit to 1 if the condition is true, else 0. 
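+// Editorial example: comparing two Vec256<uint32_t> (8 lanes) yields a
+// __mmask8; if only lanes 0 and 5 compare equal, Eq returns 0b00100001.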
+ +template +HWY_API Mask256 RebindMask(Full256 /*tag*/, Mask256 m) { + static_assert(sizeof(TFrom) == sizeof(TTo), "Must have same size"); + return Mask256{m.raw}; +} + +namespace detail { + +template +HWY_INLINE Mask256 TestBit(hwy::SizeTag<1> /*tag*/, const Vec256 v, + const Vec256 bit) { + return Mask256{_mm256_test_epi8_mask(v.raw, bit.raw)}; +} +template +HWY_INLINE Mask256 TestBit(hwy::SizeTag<2> /*tag*/, const Vec256 v, + const Vec256 bit) { + return Mask256{_mm256_test_epi16_mask(v.raw, bit.raw)}; +} +template +HWY_INLINE Mask256 TestBit(hwy::SizeTag<4> /*tag*/, const Vec256 v, + const Vec256 bit) { + return Mask256{_mm256_test_epi32_mask(v.raw, bit.raw)}; +} +template +HWY_INLINE Mask256 TestBit(hwy::SizeTag<8> /*tag*/, const Vec256 v, + const Vec256 bit) { + return Mask256{_mm256_test_epi64_mask(v.raw, bit.raw)}; +} + +} // namespace detail + +template +HWY_API Mask256 TestBit(const Vec256 v, const Vec256 bit) { + static_assert(!hwy::IsFloat(), "Only integer vectors supported"); + return detail::TestBit(hwy::SizeTag(), v, bit); +} + +// ------------------------------ Equality + +template +HWY_API Mask256 operator==(const Vec256 a, const Vec256 b) { + return Mask256{_mm256_cmpeq_epi8_mask(a.raw, b.raw)}; +} +template +HWY_API Mask256 operator==(const Vec256 a, const Vec256 b) { + return Mask256{_mm256_cmpeq_epi16_mask(a.raw, b.raw)}; +} +template +HWY_API Mask256 operator==(const Vec256 a, const Vec256 b) { + return Mask256{_mm256_cmpeq_epi32_mask(a.raw, b.raw)}; +} +template +HWY_API Mask256 operator==(const Vec256 a, const Vec256 b) { + return Mask256{_mm256_cmpeq_epi64_mask(a.raw, b.raw)}; +} + +HWY_API Mask256 operator==(Vec256 a, Vec256 b) { + return Mask256{_mm256_cmp_ps_mask(a.raw, b.raw, _CMP_EQ_OQ)}; +} + +HWY_API Mask256 operator==(Vec256 a, Vec256 b) { + return Mask256{_mm256_cmp_pd_mask(a.raw, b.raw, _CMP_EQ_OQ)}; +} + +// ------------------------------ Inequality + +template +HWY_API Mask256 operator!=(const Vec256 a, const Vec256 b) { + return Mask256{_mm256_cmpneq_epi8_mask(a.raw, b.raw)}; +} +template +HWY_API Mask256 operator!=(const Vec256 a, const Vec256 b) { + return Mask256{_mm256_cmpneq_epi16_mask(a.raw, b.raw)}; +} +template +HWY_API Mask256 operator!=(const Vec256 a, const Vec256 b) { + return Mask256{_mm256_cmpneq_epi32_mask(a.raw, b.raw)}; +} +template +HWY_API Mask256 operator!=(const Vec256 a, const Vec256 b) { + return Mask256{_mm256_cmpneq_epi64_mask(a.raw, b.raw)}; +} + +HWY_API Mask256 operator!=(Vec256 a, Vec256 b) { + return Mask256{_mm256_cmp_ps_mask(a.raw, b.raw, _CMP_NEQ_OQ)}; +} + +HWY_API Mask256 operator!=(Vec256 a, Vec256 b) { + return Mask256{_mm256_cmp_pd_mask(a.raw, b.raw, _CMP_NEQ_OQ)}; +} + +// ------------------------------ Strict inequality + +HWY_API Mask256 operator>(Vec256 a, Vec256 b) { + return Mask256{_mm256_cmpgt_epi8_mask(a.raw, b.raw)}; +} +HWY_API Mask256 operator>(Vec256 a, Vec256 b) { + return Mask256{_mm256_cmpgt_epi16_mask(a.raw, b.raw)}; +} +HWY_API Mask256 operator>(Vec256 a, Vec256 b) { + return Mask256{_mm256_cmpgt_epi32_mask(a.raw, b.raw)}; +} +HWY_API Mask256 operator>(Vec256 a, Vec256 b) { + return Mask256{_mm256_cmpgt_epi64_mask(a.raw, b.raw)}; +} + +HWY_API Mask256 operator>(Vec256 a, Vec256 b) { + return Mask256{_mm256_cmpgt_epu8_mask(a.raw, b.raw)}; +} +HWY_API Mask256 operator>(const Vec256 a, + const Vec256 b) { + return Mask256{_mm256_cmpgt_epu16_mask(a.raw, b.raw)}; +} +HWY_API Mask256 operator>(const Vec256 a, + const Vec256 b) { + return Mask256{_mm256_cmpgt_epu32_mask(a.raw, b.raw)}; +} +HWY_API 
Mask256 operator>(const Vec256 a, + const Vec256 b) { + return Mask256{_mm256_cmpgt_epu64_mask(a.raw, b.raw)}; +} + +HWY_API Mask256 operator>(Vec256 a, Vec256 b) { + return Mask256{_mm256_cmp_ps_mask(a.raw, b.raw, _CMP_GT_OQ)}; +} +HWY_API Mask256 operator>(Vec256 a, Vec256 b) { + return Mask256{_mm256_cmp_pd_mask(a.raw, b.raw, _CMP_GT_OQ)}; +} + +// ------------------------------ Weak inequality + +HWY_API Mask256 operator>=(Vec256 a, Vec256 b) { + return Mask256{_mm256_cmp_ps_mask(a.raw, b.raw, _CMP_GE_OQ)}; +} +HWY_API Mask256 operator>=(Vec256 a, Vec256 b) { + return Mask256{_mm256_cmp_pd_mask(a.raw, b.raw, _CMP_GE_OQ)}; +} + +// ------------------------------ Mask + +namespace detail { + +template +HWY_INLINE Mask256 MaskFromVec(hwy::SizeTag<1> /*tag*/, const Vec256 v) { + return Mask256{_mm256_movepi8_mask(v.raw)}; +} +template +HWY_INLINE Mask256 MaskFromVec(hwy::SizeTag<2> /*tag*/, const Vec256 v) { + return Mask256{_mm256_movepi16_mask(v.raw)}; +} +template +HWY_INLINE Mask256 MaskFromVec(hwy::SizeTag<4> /*tag*/, const Vec256 v) { + return Mask256{_mm256_movepi32_mask(v.raw)}; +} +template +HWY_INLINE Mask256 MaskFromVec(hwy::SizeTag<8> /*tag*/, const Vec256 v) { + return Mask256{_mm256_movepi64_mask(v.raw)}; +} + +} // namespace detail + +template +HWY_API Mask256 MaskFromVec(const Vec256 v) { + return detail::MaskFromVec(hwy::SizeTag(), v); +} +// There do not seem to be native floating-point versions of these instructions. +HWY_API Mask256 MaskFromVec(const Vec256 v) { + return Mask256{MaskFromVec(BitCast(Full256(), v)).raw}; +} +HWY_API Mask256 MaskFromVec(const Vec256 v) { + return Mask256{MaskFromVec(BitCast(Full256(), v)).raw}; +} + +template +HWY_API Vec256 VecFromMask(const Mask256 v) { + return Vec256{_mm256_movm_epi8(v.raw)}; +} + +template +HWY_API Vec256 VecFromMask(const Mask256 v) { + return Vec256{_mm256_movm_epi16(v.raw)}; +} + +template +HWY_API Vec256 VecFromMask(const Mask256 v) { + return Vec256{_mm256_movm_epi32(v.raw)}; +} + +template +HWY_API Vec256 VecFromMask(const Mask256 v) { + return Vec256{_mm256_movm_epi64(v.raw)}; +} + +HWY_API Vec256 VecFromMask(const Mask256 v) { + return Vec256{_mm256_castsi256_ps(_mm256_movm_epi32(v.raw))}; +} + +HWY_API Vec256 VecFromMask(const Mask256 v) { + return Vec256{_mm256_castsi256_pd(_mm256_movm_epi64(v.raw))}; +} + +template +HWY_API Vec256 VecFromMask(Full256 /* tag */, const Mask256 v) { + return VecFromMask(v); +} + +#else // AVX2 + +// Comparisons fill a lane with 1-bits if the condition is true, else 0. 
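+// Editorial example: here a true u32 lane is 0xFFFFFFFF and a false lane is
+// 0; this all-ones representation is why MaskFromVec/VecFromMask above are
+// plain bitcasts and why IfThenElse can use blendv.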
+ +template +HWY_API Mask256 RebindMask(Full256 d_to, Mask256 m) { + static_assert(sizeof(TFrom) == sizeof(TTo), "Must have same size"); + return MaskFromVec(BitCast(d_to, VecFromMask(Full256(), m))); +} + +template +HWY_API Mask256 TestBit(const Vec256 v, const Vec256 bit) { + static_assert(!hwy::IsFloat(), "Only integer vectors supported"); + return (v & bit) == bit; +} + +// ------------------------------ Equality + +template +HWY_API Mask256 operator==(const Vec256 a, const Vec256 b) { + return Mask256{_mm256_cmpeq_epi8(a.raw, b.raw)}; +} + +template +HWY_API Mask256 operator==(const Vec256 a, const Vec256 b) { + return Mask256{_mm256_cmpeq_epi16(a.raw, b.raw)}; +} + +template +HWY_API Mask256 operator==(const Vec256 a, const Vec256 b) { + return Mask256{_mm256_cmpeq_epi32(a.raw, b.raw)}; +} + +template +HWY_API Mask256 operator==(const Vec256 a, const Vec256 b) { + return Mask256{_mm256_cmpeq_epi64(a.raw, b.raw)}; +} + +HWY_API Mask256 operator==(const Vec256 a, + const Vec256 b) { + return Mask256{_mm256_cmp_ps(a.raw, b.raw, _CMP_EQ_OQ)}; +} + +HWY_API Mask256 operator==(const Vec256 a, + const Vec256 b) { + return Mask256{_mm256_cmp_pd(a.raw, b.raw, _CMP_EQ_OQ)}; +} + +// ------------------------------ Inequality + +template +HWY_API Mask256 operator!=(const Vec256 a, const Vec256 b) { + return Not(a == b); +} +HWY_API Mask256 operator!=(const Vec256 a, + const Vec256 b) { + return Mask256{_mm256_cmp_ps(a.raw, b.raw, _CMP_NEQ_OQ)}; +} +HWY_API Mask256 operator!=(const Vec256 a, + const Vec256 b) { + return Mask256{_mm256_cmp_pd(a.raw, b.raw, _CMP_NEQ_OQ)}; +} + +// ------------------------------ Strict inequality + +// Tag dispatch instead of SFINAE for MSVC 2017 compatibility +namespace detail { + +// Pre-9.3 GCC immintrin.h uses char, which may be unsigned, causing cmpgt_epi8 +// to perform an unsigned comparison instead of the intended signed. Workaround +// is to cast to an explicitly signed type. 
See https://godbolt.org/z/PL7Ujy +#if HWY_COMPILER_GCC != 0 && HWY_COMPILER_GCC < 930 +#define HWY_AVX2_GCC_CMPGT8_WORKAROUND 1 +#else +#define HWY_AVX2_GCC_CMPGT8_WORKAROUND 0 +#endif + +HWY_API Mask256 Gt(hwy::SignedTag /*tag*/, Vec256 a, + Vec256 b) { +#if HWY_AVX2_GCC_CMPGT8_WORKAROUND + using i8x32 = signed char __attribute__((__vector_size__(32))); + return Mask256{static_cast<__m256i>(reinterpret_cast(a.raw) > + reinterpret_cast(b.raw))}; +#else + return Mask256{_mm256_cmpgt_epi8(a.raw, b.raw)}; +#endif +} +HWY_API Mask256 Gt(hwy::SignedTag /*tag*/, Vec256 a, + Vec256 b) { + return Mask256{_mm256_cmpgt_epi16(a.raw, b.raw)}; +} +HWY_API Mask256 Gt(hwy::SignedTag /*tag*/, Vec256 a, + Vec256 b) { + return Mask256{_mm256_cmpgt_epi32(a.raw, b.raw)}; +} +HWY_API Mask256 Gt(hwy::SignedTag /*tag*/, Vec256 a, + Vec256 b) { + return Mask256{_mm256_cmpgt_epi64(a.raw, b.raw)}; +} + +template +HWY_INLINE Mask256 Gt(hwy::UnsignedTag /*tag*/, Vec256 a, Vec256 b) { + const Full256 du; + const RebindToSigned di; + const Vec256 msb = Set(du, (LimitsMax() >> 1) + 1); + return RebindMask(du, BitCast(di, Xor(a, msb)) > BitCast(di, Xor(b, msb))); +} + +HWY_API Mask256 Gt(hwy::FloatTag /*tag*/, Vec256 a, + Vec256 b) { + return Mask256{_mm256_cmp_ps(a.raw, b.raw, _CMP_GT_OQ)}; +} +HWY_API Mask256 Gt(hwy::FloatTag /*tag*/, Vec256 a, + Vec256 b) { + return Mask256{_mm256_cmp_pd(a.raw, b.raw, _CMP_GT_OQ)}; +} + +} // namespace detail + +template +HWY_API Mask256 operator>(Vec256 a, Vec256 b) { + return detail::Gt(hwy::TypeTag(), a, b); +} + +// ------------------------------ Weak inequality + +HWY_API Mask256 operator>=(const Vec256 a, + const Vec256 b) { + return Mask256{_mm256_cmp_ps(a.raw, b.raw, _CMP_GE_OQ)}; +} +HWY_API Mask256 operator>=(const Vec256 a, + const Vec256 b) { + return Mask256{_mm256_cmp_pd(a.raw, b.raw, _CMP_GE_OQ)}; +} + +#endif // HWY_TARGET <= HWY_AVX3 + +// ------------------------------ Reversed comparisons + +template +HWY_API Mask256 operator<(const Vec256 a, const Vec256 b) { + return b > a; +} + +template +HWY_API Mask256 operator<=(const Vec256 a, const Vec256 b) { + return b >= a; +} + +// ------------------------------ Min (Gt, IfThenElse) + +// Unsigned +HWY_API Vec256 Min(const Vec256 a, const Vec256 b) { + return Vec256{_mm256_min_epu8(a.raw, b.raw)}; +} +HWY_API Vec256 Min(const Vec256 a, + const Vec256 b) { + return Vec256{_mm256_min_epu16(a.raw, b.raw)}; +} +HWY_API Vec256 Min(const Vec256 a, + const Vec256 b) { + return Vec256{_mm256_min_epu32(a.raw, b.raw)}; +} +HWY_API Vec256 Min(const Vec256 a, + const Vec256 b) { +#if HWY_TARGET <= HWY_AVX3 + return Vec256{_mm256_min_epu64(a.raw, b.raw)}; +#else + const Full256 du; + const Full256 di; + const auto msb = Set(du, 1ull << 63); + const auto gt = RebindMask(du, BitCast(di, a ^ msb) > BitCast(di, b ^ msb)); + return IfThenElse(gt, b, a); +#endif +} + +// Signed +HWY_API Vec256 Min(const Vec256 a, const Vec256 b) { + return Vec256{_mm256_min_epi8(a.raw, b.raw)}; +} +HWY_API Vec256 Min(const Vec256 a, const Vec256 b) { + return Vec256{_mm256_min_epi16(a.raw, b.raw)}; +} +HWY_API Vec256 Min(const Vec256 a, const Vec256 b) { + return Vec256{_mm256_min_epi32(a.raw, b.raw)}; +} +HWY_API Vec256 Min(const Vec256 a, const Vec256 b) { +#if HWY_TARGET <= HWY_AVX3 + return Vec256{_mm256_min_epi64(a.raw, b.raw)}; +#else + return IfThenElse(a < b, a, b); +#endif +} + +// Float +HWY_API Vec256 Min(const Vec256 a, const Vec256 b) { + return Vec256{_mm256_min_ps(a.raw, b.raw)}; +} +HWY_API Vec256 Min(const Vec256 a, const Vec256 b) { + 
return Vec256{_mm256_min_pd(a.raw, b.raw)}; +} + +// ------------------------------ Max (Gt, IfThenElse) + +// Unsigned +HWY_API Vec256 Max(const Vec256 a, const Vec256 b) { + return Vec256{_mm256_max_epu8(a.raw, b.raw)}; +} +HWY_API Vec256 Max(const Vec256 a, + const Vec256 b) { + return Vec256{_mm256_max_epu16(a.raw, b.raw)}; +} +HWY_API Vec256 Max(const Vec256 a, + const Vec256 b) { + return Vec256{_mm256_max_epu32(a.raw, b.raw)}; +} +HWY_API Vec256 Max(const Vec256 a, + const Vec256 b) { +#if HWY_TARGET <= HWY_AVX3 + return Vec256{_mm256_max_epu64(a.raw, b.raw)}; +#else + const Full256 du; + const Full256 di; + const auto msb = Set(du, 1ull << 63); + const auto gt = RebindMask(du, BitCast(di, a ^ msb) > BitCast(di, b ^ msb)); + return IfThenElse(gt, a, b); +#endif +} + +// Signed +HWY_API Vec256 Max(const Vec256 a, const Vec256 b) { + return Vec256{_mm256_max_epi8(a.raw, b.raw)}; +} +HWY_API Vec256 Max(const Vec256 a, const Vec256 b) { + return Vec256{_mm256_max_epi16(a.raw, b.raw)}; +} +HWY_API Vec256 Max(const Vec256 a, const Vec256 b) { + return Vec256{_mm256_max_epi32(a.raw, b.raw)}; +} +HWY_API Vec256 Max(const Vec256 a, const Vec256 b) { +#if HWY_TARGET <= HWY_AVX3 + return Vec256{_mm256_max_epi64(a.raw, b.raw)}; +#else + return IfThenElse(a < b, b, a); +#endif +} + +// Float +HWY_API Vec256 Max(const Vec256 a, const Vec256 b) { + return Vec256{_mm256_max_ps(a.raw, b.raw)}; +} +HWY_API Vec256 Max(const Vec256 a, const Vec256 b) { + return Vec256{_mm256_max_pd(a.raw, b.raw)}; +} + +// ------------------------------ FirstN (Iota, Lt) + +template +HWY_API Mask256 FirstN(const Full256 d, size_t n) { +#if HWY_TARGET <= HWY_AVX3 + (void)d; + constexpr size_t N = 32 / sizeof(T); +#if HWY_ARCH_X86_64 + const uint64_t all = (1ull << N) - 1; + // BZHI only looks at the lower 8 bits of n! + return Mask256::FromBits((n > 255) ? all : _bzhi_u64(all, n)); +#else + const uint32_t all = static_cast((1ull << N) - 1); + // BZHI only looks at the lower 8 bits of n! + return Mask256::FromBits( + (n > 255) ? all : _bzhi_u32(all, static_cast(n))); +#endif // HWY_ARCH_X86_64 +#else + const RebindToSigned di; // Signed comparisons are cheaper. 
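+  // Editorial walkthrough: Iota(di, 0) = {0, 1, 2, ...}, so comparing with
+  // Set(di, n) turns on exactly lanes 0..n-1; e.g. n = 3 with u32 lanes
+  // gives {T, T, T, F, F, F, F, F}.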
+ return RebindMask(d, Iota(di, 0) < Set(di, static_cast>(n))); +#endif +} + +// ================================================== ARITHMETIC + +// ------------------------------ Addition + +// Unsigned +HWY_API Vec256 operator+(const Vec256 a, + const Vec256 b) { + return Vec256{_mm256_add_epi8(a.raw, b.raw)}; +} +HWY_API Vec256 operator+(const Vec256 a, + const Vec256 b) { + return Vec256{_mm256_add_epi16(a.raw, b.raw)}; +} +HWY_API Vec256 operator+(const Vec256 a, + const Vec256 b) { + return Vec256{_mm256_add_epi32(a.raw, b.raw)}; +} +HWY_API Vec256 operator+(const Vec256 a, + const Vec256 b) { + return Vec256{_mm256_add_epi64(a.raw, b.raw)}; +} + +// Signed +HWY_API Vec256 operator+(const Vec256 a, + const Vec256 b) { + return Vec256{_mm256_add_epi8(a.raw, b.raw)}; +} +HWY_API Vec256 operator+(const Vec256 a, + const Vec256 b) { + return Vec256{_mm256_add_epi16(a.raw, b.raw)}; +} +HWY_API Vec256 operator+(const Vec256 a, + const Vec256 b) { + return Vec256{_mm256_add_epi32(a.raw, b.raw)}; +} +HWY_API Vec256 operator+(const Vec256 a, + const Vec256 b) { + return Vec256{_mm256_add_epi64(a.raw, b.raw)}; +} + +// Float +HWY_API Vec256 operator+(const Vec256 a, const Vec256 b) { + return Vec256{_mm256_add_ps(a.raw, b.raw)}; +} +HWY_API Vec256 operator+(const Vec256 a, + const Vec256 b) { + return Vec256{_mm256_add_pd(a.raw, b.raw)}; +} + +// ------------------------------ Subtraction + +// Unsigned +HWY_API Vec256 operator-(const Vec256 a, + const Vec256 b) { + return Vec256{_mm256_sub_epi8(a.raw, b.raw)}; +} +HWY_API Vec256 operator-(const Vec256 a, + const Vec256 b) { + return Vec256{_mm256_sub_epi16(a.raw, b.raw)}; +} +HWY_API Vec256 operator-(const Vec256 a, + const Vec256 b) { + return Vec256{_mm256_sub_epi32(a.raw, b.raw)}; +} +HWY_API Vec256 operator-(const Vec256 a, + const Vec256 b) { + return Vec256{_mm256_sub_epi64(a.raw, b.raw)}; +} + +// Signed +HWY_API Vec256 operator-(const Vec256 a, + const Vec256 b) { + return Vec256{_mm256_sub_epi8(a.raw, b.raw)}; +} +HWY_API Vec256 operator-(const Vec256 a, + const Vec256 b) { + return Vec256{_mm256_sub_epi16(a.raw, b.raw)}; +} +HWY_API Vec256 operator-(const Vec256 a, + const Vec256 b) { + return Vec256{_mm256_sub_epi32(a.raw, b.raw)}; +} +HWY_API Vec256 operator-(const Vec256 a, + const Vec256 b) { + return Vec256{_mm256_sub_epi64(a.raw, b.raw)}; +} + +// Float +HWY_API Vec256 operator-(const Vec256 a, const Vec256 b) { + return Vec256{_mm256_sub_ps(a.raw, b.raw)}; +} +HWY_API Vec256 operator-(const Vec256 a, + const Vec256 b) { + return Vec256{_mm256_sub_pd(a.raw, b.raw)}; +} + +// ------------------------------ SumsOf8 +HWY_API Vec256 SumsOf8(const Vec256 v) { + return Vec256{_mm256_sad_epu8(v.raw, _mm256_setzero_si256())}; +} + +// ------------------------------ SaturatedAdd + +// Returns a + b clamped to the destination range. + +// Unsigned +HWY_API Vec256 SaturatedAdd(const Vec256 a, + const Vec256 b) { + return Vec256{_mm256_adds_epu8(a.raw, b.raw)}; +} +HWY_API Vec256 SaturatedAdd(const Vec256 a, + const Vec256 b) { + return Vec256{_mm256_adds_epu16(a.raw, b.raw)}; +} + +// Signed +HWY_API Vec256 SaturatedAdd(const Vec256 a, + const Vec256 b) { + return Vec256{_mm256_adds_epi8(a.raw, b.raw)}; +} +HWY_API Vec256 SaturatedAdd(const Vec256 a, + const Vec256 b) { + return Vec256{_mm256_adds_epi16(a.raw, b.raw)}; +} + +// ------------------------------ SaturatedSub + +// Returns a - b clamped to the destination range. 
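+// Editorial examples: u8: SaturatedSub(5, 10) = 0 (clamped at 0); i8:
+// SaturatedSub(-100, 100) = -128 instead of the wrapped value 56.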
+ +// Unsigned +HWY_API Vec256 SaturatedSub(const Vec256 a, + const Vec256 b) { + return Vec256{_mm256_subs_epu8(a.raw, b.raw)}; +} +HWY_API Vec256 SaturatedSub(const Vec256 a, + const Vec256 b) { + return Vec256{_mm256_subs_epu16(a.raw, b.raw)}; +} + +// Signed +HWY_API Vec256 SaturatedSub(const Vec256 a, + const Vec256 b) { + return Vec256{_mm256_subs_epi8(a.raw, b.raw)}; +} +HWY_API Vec256 SaturatedSub(const Vec256 a, + const Vec256 b) { + return Vec256{_mm256_subs_epi16(a.raw, b.raw)}; +} + +// ------------------------------ Average + +// Returns (a + b + 1) / 2 + +// Unsigned +HWY_API Vec256 AverageRound(const Vec256 a, + const Vec256 b) { + return Vec256{_mm256_avg_epu8(a.raw, b.raw)}; +} +HWY_API Vec256 AverageRound(const Vec256 a, + const Vec256 b) { + return Vec256{_mm256_avg_epu16(a.raw, b.raw)}; +} + +// ------------------------------ Abs (Sub) + +// Returns absolute value, except that LimitsMin() maps to LimitsMax() + 1. +HWY_API Vec256 Abs(const Vec256 v) { +#if HWY_COMPILER_MSVC + // Workaround for incorrect codegen? (wrong result) + const auto zero = Zero(Full256()); + return Vec256{_mm256_max_epi8(v.raw, (zero - v).raw)}; +#else + return Vec256{_mm256_abs_epi8(v.raw)}; +#endif +} +HWY_API Vec256 Abs(const Vec256 v) { + return Vec256{_mm256_abs_epi16(v.raw)}; +} +HWY_API Vec256 Abs(const Vec256 v) { + return Vec256{_mm256_abs_epi32(v.raw)}; +} +// i64 is implemented after BroadcastSignBit. + +HWY_API Vec256 Abs(const Vec256 v) { + const Vec256 mask{_mm256_set1_epi32(0x7FFFFFFF)}; + return v & BitCast(Full256(), mask); +} +HWY_API Vec256 Abs(const Vec256 v) { + const Vec256 mask{_mm256_set1_epi64x(0x7FFFFFFFFFFFFFFFLL)}; + return v & BitCast(Full256(), mask); +} + +// ------------------------------ Integer multiplication + +// Unsigned +HWY_API Vec256 operator*(Vec256 a, Vec256 b) { + return Vec256{_mm256_mullo_epi16(a.raw, b.raw)}; +} +HWY_API Vec256 operator*(Vec256 a, Vec256 b) { + return Vec256{_mm256_mullo_epi32(a.raw, b.raw)}; +} + +// Signed +HWY_API Vec256 operator*(Vec256 a, Vec256 b) { + return Vec256{_mm256_mullo_epi16(a.raw, b.raw)}; +} +HWY_API Vec256 operator*(Vec256 a, Vec256 b) { + return Vec256{_mm256_mullo_epi32(a.raw, b.raw)}; +} + +// Returns the upper 16 bits of a * b in each lane. +HWY_API Vec256 MulHigh(Vec256 a, Vec256 b) { + return Vec256{_mm256_mulhi_epu16(a.raw, b.raw)}; +} +HWY_API Vec256 MulHigh(Vec256 a, Vec256 b) { + return Vec256{_mm256_mulhi_epi16(a.raw, b.raw)}; +} + +HWY_API Vec256 MulFixedPoint15(Vec256 a, Vec256 b) { + return Vec256{_mm256_mulhrs_epi16(a.raw, b.raw)}; +} + +// Multiplies even lanes (0, 2 ..) and places the double-wide result into +// even and the upper half into its odd neighbor lane. 
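+// Editorial example: viewing the u32 inputs as {a0, a1, a2, a3, ...} and
+// {b0, b1, b2, b3, ...}, MulEven below returns the four full 64-bit products
+// {a0*b0, a2*b2, a4*b4, a6*b6} when read as Vec256<uint64_t>.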
+HWY_API Vec256 MulEven(Vec256 a, Vec256 b) { + return Vec256{_mm256_mul_epi32(a.raw, b.raw)}; +} +HWY_API Vec256 MulEven(Vec256 a, Vec256 b) { + return Vec256{_mm256_mul_epu32(a.raw, b.raw)}; +} + +// ------------------------------ ShiftLeft + +template +HWY_API Vec256 ShiftLeft(const Vec256 v) { + return Vec256{_mm256_slli_epi16(v.raw, kBits)}; +} + +template +HWY_API Vec256 ShiftLeft(const Vec256 v) { + return Vec256{_mm256_slli_epi32(v.raw, kBits)}; +} + +template +HWY_API Vec256 ShiftLeft(const Vec256 v) { + return Vec256{_mm256_slli_epi64(v.raw, kBits)}; +} + +template +HWY_API Vec256 ShiftLeft(const Vec256 v) { + return Vec256{_mm256_slli_epi16(v.raw, kBits)}; +} + +template +HWY_API Vec256 ShiftLeft(const Vec256 v) { + return Vec256{_mm256_slli_epi32(v.raw, kBits)}; +} + +template +HWY_API Vec256 ShiftLeft(const Vec256 v) { + return Vec256{_mm256_slli_epi64(v.raw, kBits)}; +} + +template +HWY_API Vec256 ShiftLeft(const Vec256 v) { + const Full256 d8; + const RepartitionToWide d16; + const auto shifted = BitCast(d8, ShiftLeft(BitCast(d16, v))); + return kBits == 1 + ? (v + v) + : (shifted & Set(d8, static_cast((0xFF << kBits) & 0xFF))); +} + +// ------------------------------ ShiftRight + +template +HWY_API Vec256 ShiftRight(const Vec256 v) { + return Vec256{_mm256_srli_epi16(v.raw, kBits)}; +} + +template +HWY_API Vec256 ShiftRight(const Vec256 v) { + return Vec256{_mm256_srli_epi32(v.raw, kBits)}; +} + +template +HWY_API Vec256 ShiftRight(const Vec256 v) { + return Vec256{_mm256_srli_epi64(v.raw, kBits)}; +} + +template +HWY_API Vec256 ShiftRight(const Vec256 v) { + const Full256 d8; + // Use raw instead of BitCast to support N=1. + const Vec256 shifted{ShiftRight(Vec256{v.raw}).raw}; + return shifted & Set(d8, 0xFF >> kBits); +} + +template +HWY_API Vec256 ShiftRight(const Vec256 v) { + return Vec256{_mm256_srai_epi16(v.raw, kBits)}; +} + +template +HWY_API Vec256 ShiftRight(const Vec256 v) { + return Vec256{_mm256_srai_epi32(v.raw, kBits)}; +} + +template +HWY_API Vec256 ShiftRight(const Vec256 v) { + const Full256 di; + const Full256 du; + const auto shifted = BitCast(di, ShiftRight(BitCast(du, v))); + const auto shifted_sign = BitCast(di, Set(du, 0x80 >> kBits)); + return (shifted ^ shifted_sign) - shifted_sign; +} + +// i64 is implemented after BroadcastSignBit. 
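+// Illustrative scalar model of the i8 ShiftRight emulation above
+// (hypothetical helper, not part of the library): shift as unsigned, then
+// restore the sign via (shifted ^ sign) - sign, where sign = 0x80 >> kBits
+// marks the position the sign bit was shifted to.
+inline int8_t ShiftRightI8Scalar(int8_t v, int kBits) {
+  const uint8_t shifted =
+      static_cast<uint8_t>(static_cast<uint8_t>(v) >> kBits);
+  const uint8_t sign = static_cast<uint8_t>(0x80 >> kBits);
+  const uint8_t fixed = static_cast<uint8_t>((shifted ^ sign) - sign);
+  return static_cast<int8_t>(fixed);  // e.g. v = -1, kBits = 2 -> -1
+}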
+ +// ------------------------------ RotateRight + +template +HWY_API Vec256 RotateRight(const Vec256 v) { + static_assert(0 <= kBits && kBits < 32, "Invalid shift count"); +#if HWY_TARGET <= HWY_AVX3 + return Vec256{_mm256_ror_epi32(v.raw, kBits)}; +#else + if (kBits == 0) return v; + return Or(ShiftRight(v), ShiftLeft(v)); +#endif +} + +template +HWY_API Vec256 RotateRight(const Vec256 v) { + static_assert(0 <= kBits && kBits < 64, "Invalid shift count"); +#if HWY_TARGET <= HWY_AVX3 + return Vec256{_mm256_ror_epi64(v.raw, kBits)}; +#else + if (kBits == 0) return v; + return Or(ShiftRight(v), ShiftLeft(v)); +#endif +} + +// ------------------------------ BroadcastSignBit (ShiftRight, compare, mask) + +HWY_API Vec256 BroadcastSignBit(const Vec256 v) { + return VecFromMask(v < Zero(Full256())); +} + +HWY_API Vec256 BroadcastSignBit(const Vec256 v) { + return ShiftRight<15>(v); +} + +HWY_API Vec256 BroadcastSignBit(const Vec256 v) { + return ShiftRight<31>(v); +} + +HWY_API Vec256 BroadcastSignBit(const Vec256 v) { +#if HWY_TARGET == HWY_AVX2 + return VecFromMask(v < Zero(Full256())); +#else + return Vec256{_mm256_srai_epi64(v.raw, 63)}; +#endif +} + +template +HWY_API Vec256 ShiftRight(const Vec256 v) { +#if HWY_TARGET <= HWY_AVX3 + return Vec256{_mm256_srai_epi64(v.raw, kBits)}; +#else + const Full256 di; + const Full256 du; + const auto right = BitCast(di, ShiftRight(BitCast(du, v))); + const auto sign = ShiftLeft<64 - kBits>(BroadcastSignBit(v)); + return right | sign; +#endif +} + +HWY_API Vec256 Abs(const Vec256 v) { +#if HWY_TARGET <= HWY_AVX3 + return Vec256{_mm256_abs_epi64(v.raw)}; +#else + const auto zero = Zero(Full256()); + return IfThenElse(MaskFromVec(BroadcastSignBit(v)), zero - v, v); +#endif +} + +// ------------------------------ IfNegativeThenElse (BroadcastSignBit) +HWY_API Vec256 IfNegativeThenElse(Vec256 v, Vec256 yes, + Vec256 no) { + // int8: AVX2 IfThenElse only looks at the MSB. + return IfThenElse(MaskFromVec(v), yes, no); +} + +template +HWY_API Vec256 IfNegativeThenElse(Vec256 v, Vec256 yes, Vec256 no) { + static_assert(IsSigned(), "Only works for signed/float"); + const Full256 d; + const RebindToSigned di; + + // 16-bit: no native blendv, so copy sign to lower byte's MSB. + v = BitCast(d, BroadcastSignBit(BitCast(di, v))); + return IfThenElse(MaskFromVec(v), yes, no); +} + +template +HWY_API Vec256 IfNegativeThenElse(Vec256 v, Vec256 yes, Vec256 no) { + static_assert(IsSigned(), "Only works for signed/float"); + const Full256 d; + const RebindToFloat df; + + // 32/64-bit: use float IfThenElse, which only looks at the MSB. 
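+  // Editorial note (not upstream code): the blend tests only the sign bit,
+  // so `yes` is selected wherever v's MSB is set -- for floats this includes
+  // -0.0f, which compares equal to 0.0f but still has its sign bit set.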
+ const MFromD msb = MaskFromVec(BitCast(df, v)); + return BitCast(d, IfThenElse(msb, BitCast(df, yes), BitCast(df, no))); +} + +// ------------------------------ ShiftLeftSame + +HWY_API Vec256 ShiftLeftSame(const Vec256 v, + const int bits) { + return Vec256{_mm256_sll_epi16(v.raw, _mm_cvtsi32_si128(bits))}; +} +HWY_API Vec256 ShiftLeftSame(const Vec256 v, + const int bits) { + return Vec256{_mm256_sll_epi32(v.raw, _mm_cvtsi32_si128(bits))}; +} +HWY_API Vec256 ShiftLeftSame(const Vec256 v, + const int bits) { + return Vec256{_mm256_sll_epi64(v.raw, _mm_cvtsi32_si128(bits))}; +} + +HWY_API Vec256 ShiftLeftSame(const Vec256 v, const int bits) { + return Vec256{_mm256_sll_epi16(v.raw, _mm_cvtsi32_si128(bits))}; +} + +HWY_API Vec256 ShiftLeftSame(const Vec256 v, const int bits) { + return Vec256{_mm256_sll_epi32(v.raw, _mm_cvtsi32_si128(bits))}; +} + +HWY_API Vec256 ShiftLeftSame(const Vec256 v, const int bits) { + return Vec256{_mm256_sll_epi64(v.raw, _mm_cvtsi32_si128(bits))}; +} + +template +HWY_API Vec256 ShiftLeftSame(const Vec256 v, const int bits) { + const Full256 d8; + const RepartitionToWide d16; + const auto shifted = BitCast(d8, ShiftLeftSame(BitCast(d16, v), bits)); + return shifted & Set(d8, static_cast((0xFF << bits) & 0xFF)); +} + +// ------------------------------ ShiftRightSame (BroadcastSignBit) + +HWY_API Vec256 ShiftRightSame(const Vec256 v, + const int bits) { + return Vec256{_mm256_srl_epi16(v.raw, _mm_cvtsi32_si128(bits))}; +} +HWY_API Vec256 ShiftRightSame(const Vec256 v, + const int bits) { + return Vec256{_mm256_srl_epi32(v.raw, _mm_cvtsi32_si128(bits))}; +} +HWY_API Vec256 ShiftRightSame(const Vec256 v, + const int bits) { + return Vec256{_mm256_srl_epi64(v.raw, _mm_cvtsi32_si128(bits))}; +} + +HWY_API Vec256 ShiftRightSame(Vec256 v, const int bits) { + const Full256 d8; + const RepartitionToWide d16; + const auto shifted = BitCast(d8, ShiftRightSame(BitCast(d16, v), bits)); + return shifted & Set(d8, static_cast(0xFF >> bits)); +} + +HWY_API Vec256 ShiftRightSame(const Vec256 v, + const int bits) { + return Vec256{_mm256_sra_epi16(v.raw, _mm_cvtsi32_si128(bits))}; +} + +HWY_API Vec256 ShiftRightSame(const Vec256 v, + const int bits) { + return Vec256{_mm256_sra_epi32(v.raw, _mm_cvtsi32_si128(bits))}; +} +HWY_API Vec256 ShiftRightSame(const Vec256 v, + const int bits) { +#if HWY_TARGET <= HWY_AVX3 + return Vec256{_mm256_sra_epi64(v.raw, _mm_cvtsi32_si128(bits))}; +#else + const Full256 di; + const Full256 du; + const auto right = BitCast(di, ShiftRightSame(BitCast(du, v), bits)); + const auto sign = ShiftLeftSame(BroadcastSignBit(v), 64 - bits); + return right | sign; +#endif +} + +HWY_API Vec256 ShiftRightSame(Vec256 v, const int bits) { + const Full256 di; + const Full256 du; + const auto shifted = BitCast(di, ShiftRightSame(BitCast(du, v), bits)); + const auto shifted_sign = + BitCast(di, Set(du, static_cast(0x80 >> bits))); + return (shifted ^ shifted_sign) - shifted_sign; +} + +// ------------------------------ Neg (Xor, Sub) + +// Tag dispatch instead of SFINAE for MSVC 2017 compatibility +namespace detail { + +template +HWY_INLINE Vec256 Neg(hwy::FloatTag /*tag*/, const Vec256 v) { + return Xor(v, SignBit(Full256())); +} + +// Not floating-point +template +HWY_INLINE Vec256 Neg(hwy::NonFloatTag /*tag*/, const Vec256 v) { + return Zero(Full256()) - v; +} + +} // namespace detail + +template +HWY_API Vec256 Neg(const Vec256 v) { + return detail::Neg(hwy::IsFloatTag(), v); +} + +// ------------------------------ Floating-point mul / div + +HWY_API Vec256 
operator*(const Vec256 a, const Vec256 b) { + return Vec256{_mm256_mul_ps(a.raw, b.raw)}; +} +HWY_API Vec256 operator*(const Vec256 a, + const Vec256 b) { + return Vec256{_mm256_mul_pd(a.raw, b.raw)}; +} + +HWY_API Vec256 operator/(const Vec256 a, const Vec256 b) { + return Vec256{_mm256_div_ps(a.raw, b.raw)}; +} +HWY_API Vec256 operator/(const Vec256 a, + const Vec256 b) { + return Vec256{_mm256_div_pd(a.raw, b.raw)}; +} + +// Approximate reciprocal +HWY_API Vec256 ApproximateReciprocal(const Vec256 v) { + return Vec256{_mm256_rcp_ps(v.raw)}; +} + +// Absolute value of difference. +HWY_API Vec256 AbsDiff(const Vec256 a, const Vec256 b) { + return Abs(a - b); +} + +// ------------------------------ Floating-point multiply-add variants + +// Returns mul * x + add +HWY_API Vec256 MulAdd(const Vec256 mul, const Vec256 x, + const Vec256 add) { +#ifdef HWY_DISABLE_BMI2_FMA + return mul * x + add; +#else + return Vec256{_mm256_fmadd_ps(mul.raw, x.raw, add.raw)}; +#endif +} +HWY_API Vec256 MulAdd(const Vec256 mul, const Vec256 x, + const Vec256 add) { +#ifdef HWY_DISABLE_BMI2_FMA + return mul * x + add; +#else + return Vec256{_mm256_fmadd_pd(mul.raw, x.raw, add.raw)}; +#endif +} + +// Returns add - mul * x +HWY_API Vec256 NegMulAdd(const Vec256 mul, const Vec256 x, + const Vec256 add) { +#ifdef HWY_DISABLE_BMI2_FMA + return add - mul * x; +#else + return Vec256{_mm256_fnmadd_ps(mul.raw, x.raw, add.raw)}; +#endif +} +HWY_API Vec256 NegMulAdd(const Vec256 mul, + const Vec256 x, + const Vec256 add) { +#ifdef HWY_DISABLE_BMI2_FMA + return add - mul * x; +#else + return Vec256{_mm256_fnmadd_pd(mul.raw, x.raw, add.raw)}; +#endif +} + +// Returns mul * x - sub +HWY_API Vec256 MulSub(const Vec256 mul, const Vec256 x, + const Vec256 sub) { +#ifdef HWY_DISABLE_BMI2_FMA + return mul * x - sub; +#else + return Vec256{_mm256_fmsub_ps(mul.raw, x.raw, sub.raw)}; +#endif +} +HWY_API Vec256 MulSub(const Vec256 mul, const Vec256 x, + const Vec256 sub) { +#ifdef HWY_DISABLE_BMI2_FMA + return mul * x - sub; +#else + return Vec256{_mm256_fmsub_pd(mul.raw, x.raw, sub.raw)}; +#endif +} + +// Returns -mul * x - sub +HWY_API Vec256 NegMulSub(const Vec256 mul, const Vec256 x, + const Vec256 sub) { +#ifdef HWY_DISABLE_BMI2_FMA + return Neg(mul * x) - sub; +#else + return Vec256{_mm256_fnmsub_ps(mul.raw, x.raw, sub.raw)}; +#endif +} +HWY_API Vec256 NegMulSub(const Vec256 mul, + const Vec256 x, + const Vec256 sub) { +#ifdef HWY_DISABLE_BMI2_FMA + return Neg(mul * x) - sub; +#else + return Vec256{_mm256_fnmsub_pd(mul.raw, x.raw, sub.raw)}; +#endif +} + +// ------------------------------ Floating-point square root + +// Full precision square root +HWY_API Vec256 Sqrt(const Vec256 v) { + return Vec256{_mm256_sqrt_ps(v.raw)}; +} +HWY_API Vec256 Sqrt(const Vec256 v) { + return Vec256{_mm256_sqrt_pd(v.raw)}; +} + +// Approximate reciprocal square root +HWY_API Vec256 ApproximateReciprocalSqrt(const Vec256 v) { + return Vec256{_mm256_rsqrt_ps(v.raw)}; +} + +// ------------------------------ Floating-point rounding + +// Toward nearest integer, tie to even +HWY_API Vec256 Round(const Vec256 v) { + return Vec256{ + _mm256_round_ps(v.raw, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)}; +} +HWY_API Vec256 Round(const Vec256 v) { + return Vec256{ + _mm256_round_pd(v.raw, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)}; +} + +// Toward zero, aka truncate +HWY_API Vec256 Trunc(const Vec256 v) { + return Vec256{ + _mm256_round_ps(v.raw, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)}; +} +HWY_API Vec256 Trunc(const Vec256 v) { + return Vec256{ + 
_mm256_round_pd(v.raw, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)}; +} + +// Toward +infinity, aka ceiling +HWY_API Vec256 Ceil(const Vec256 v) { + return Vec256{ + _mm256_round_ps(v.raw, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)}; +} +HWY_API Vec256 Ceil(const Vec256 v) { + return Vec256{ + _mm256_round_pd(v.raw, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)}; +} + +// Toward -infinity, aka floor +HWY_API Vec256 Floor(const Vec256 v) { + return Vec256{ + _mm256_round_ps(v.raw, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)}; +} +HWY_API Vec256 Floor(const Vec256 v) { + return Vec256{ + _mm256_round_pd(v.raw, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)}; +} + +// ------------------------------ Floating-point classification + +HWY_API Mask256 IsNaN(const Vec256 v) { +#if HWY_TARGET <= HWY_AVX3 + return Mask256{_mm256_fpclass_ps_mask(v.raw, 0x81)}; +#else + return Mask256{_mm256_cmp_ps(v.raw, v.raw, _CMP_UNORD_Q)}; +#endif +} +HWY_API Mask256 IsNaN(const Vec256 v) { +#if HWY_TARGET <= HWY_AVX3 + return Mask256{_mm256_fpclass_pd_mask(v.raw, 0x81)}; +#else + return Mask256{_mm256_cmp_pd(v.raw, v.raw, _CMP_UNORD_Q)}; +#endif +} + +#if HWY_TARGET <= HWY_AVX3 + +HWY_API Mask256 IsInf(const Vec256 v) { + return Mask256{_mm256_fpclass_ps_mask(v.raw, 0x18)}; +} +HWY_API Mask256 IsInf(const Vec256 v) { + return Mask256{_mm256_fpclass_pd_mask(v.raw, 0x18)}; +} + +HWY_API Mask256 IsFinite(const Vec256 v) { + // fpclass doesn't have a flag for positive, so we have to check for inf/NaN + // and negate the mask. + return Not(Mask256{_mm256_fpclass_ps_mask(v.raw, 0x99)}); +} +HWY_API Mask256 IsFinite(const Vec256 v) { + return Not(Mask256{_mm256_fpclass_pd_mask(v.raw, 0x99)}); +} + +#else + +template +HWY_API Mask256 IsInf(const Vec256 v) { + static_assert(IsFloat(), "Only for float"); + const Full256 d; + const RebindToSigned di; + const VFromD vi = BitCast(di, v); + // 'Shift left' to clear the sign bit, check for exponent=max and mantissa=0. + return RebindMask(d, Eq(Add(vi, vi), Set(di, hwy::MaxExponentTimes2()))); +} + +// Returns whether normal/subnormal/zero. +template +HWY_API Mask256 IsFinite(const Vec256 v) { + static_assert(IsFloat(), "Only for float"); + const Full256 d; + const RebindToUnsigned du; + const RebindToSigned di; // cheaper than unsigned comparison + const VFromD vu = BitCast(du, v); + // Shift left to clear the sign bit, then right so we can compare with the + // max exponent (cannot compare with MaxExponentTimes2 directly because it is + // negative and non-negative floats would be greater). MSVC seems to generate + // incorrect code if we instead add vu + vu. 
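+  // Editorial note (not upstream code): for f32, ShiftLeft<1> drops the sign
+  // and ShiftRight<MantissaBits+1> (= 24) keeps the biased exponent: 1.0f =
+  // 0x3F800000 -> exp 0x7F < 0xFF (finite); +inf = 0x7F800000 -> exp 0xFF.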
+ const VFromD exp = + BitCast(di, ShiftRight() + 1>(ShiftLeft<1>(vu))); + return RebindMask(d, Lt(exp, Set(di, hwy::MaxExponentField()))); +} + +#endif // HWY_TARGET <= HWY_AVX3 + +// ================================================== MEMORY + +// ------------------------------ Load + +template +HWY_API Vec256 Load(Full256 /* tag */, const T* HWY_RESTRICT aligned) { + return Vec256{ + _mm256_load_si256(reinterpret_cast(aligned))}; +} +HWY_API Vec256 Load(Full256 /* tag */, + const float* HWY_RESTRICT aligned) { + return Vec256{_mm256_load_ps(aligned)}; +} +HWY_API Vec256 Load(Full256 /* tag */, + const double* HWY_RESTRICT aligned) { + return Vec256{_mm256_load_pd(aligned)}; +} + +template +HWY_API Vec256 LoadU(Full256 /* tag */, const T* HWY_RESTRICT p) { + return Vec256{_mm256_loadu_si256(reinterpret_cast(p))}; +} +HWY_API Vec256 LoadU(Full256 /* tag */, + const float* HWY_RESTRICT p) { + return Vec256{_mm256_loadu_ps(p)}; +} +HWY_API Vec256 LoadU(Full256 /* tag */, + const double* HWY_RESTRICT p) { + return Vec256{_mm256_loadu_pd(p)}; +} + +// ------------------------------ MaskedLoad + +#if HWY_TARGET <= HWY_AVX3 + +template +HWY_API Vec256 MaskedLoad(Mask256 m, Full256 /* tag */, + const T* HWY_RESTRICT p) { + return Vec256{_mm256_maskz_loadu_epi8(m.raw, p)}; +} + +template +HWY_API Vec256 MaskedLoad(Mask256 m, Full256 /* tag */, + const T* HWY_RESTRICT p) { + return Vec256{_mm256_maskz_loadu_epi16(m.raw, p)}; +} + +template +HWY_API Vec256 MaskedLoad(Mask256 m, Full256 /* tag */, + const T* HWY_RESTRICT p) { + return Vec256{_mm256_maskz_loadu_epi32(m.raw, p)}; +} + +template +HWY_API Vec256 MaskedLoad(Mask256 m, Full256 /* tag */, + const T* HWY_RESTRICT p) { + return Vec256{_mm256_maskz_loadu_epi64(m.raw, p)}; +} + +HWY_API Vec256 MaskedLoad(Mask256 m, Full256 /* tag */, + const float* HWY_RESTRICT p) { + return Vec256{_mm256_maskz_loadu_ps(m.raw, p)}; +} + +HWY_API Vec256 MaskedLoad(Mask256 m, Full256 /* tag */, + const double* HWY_RESTRICT p) { + return Vec256{_mm256_maskz_loadu_pd(m.raw, p)}; +} + +#else // AVX2 + +// There is no maskload_epi8/16, so blend instead. +template * = nullptr> +HWY_API Vec256 MaskedLoad(Mask256 m, Full256 d, + const T* HWY_RESTRICT p) { + return IfThenElseZero(m, LoadU(d, p)); +} + +template +HWY_API Vec256 MaskedLoad(Mask256 m, Full256 /* tag */, + const T* HWY_RESTRICT p) { + auto pi = reinterpret_cast(p); // NOLINT + return Vec256{_mm256_maskload_epi32(pi, m.raw)}; +} + +template +HWY_API Vec256 MaskedLoad(Mask256 m, Full256 /* tag */, + const T* HWY_RESTRICT p) { + auto pi = reinterpret_cast(p); // NOLINT + return Vec256{_mm256_maskload_epi64(pi, m.raw)}; +} + +HWY_API Vec256 MaskedLoad(Mask256 m, Full256 d, + const float* HWY_RESTRICT p) { + const Vec256 mi = + BitCast(RebindToSigned(), VecFromMask(d, m)); + return Vec256{_mm256_maskload_ps(p, mi.raw)}; +} + +HWY_API Vec256 MaskedLoad(Mask256 m, Full256 d, + const double* HWY_RESTRICT p) { + const Vec256 mi = + BitCast(RebindToSigned(), VecFromMask(d, m)); + return Vec256{_mm256_maskload_pd(p, mi.raw)}; +} + +#endif + +// ------------------------------ LoadDup128 + +// Loads 128 bit and duplicates into both 128-bit halves. This avoids the +// 3-cycle cost of moving data between 128-bit halves and avoids port 5. +template +HWY_API Vec256 LoadDup128(Full256 /* tag */, const T* HWY_RESTRICT p) { +#if HWY_COMPILER_MSVC && HWY_COMPILER_MSVC < 1931 + // Workaround for incorrect results with _mm256_broadcastsi128_si256. 
Note + // that MSVC also lacks _mm256_zextsi128_si256, but cast (which leaves the + // upper half undefined) is fine because we're overwriting that anyway. + // This workaround seems in turn to generate incorrect code in MSVC 2022 + // (19.31), so use broadcastsi128 there. + const __m128i v128 = LoadU(Full128(), p).raw; + return Vec256{ + _mm256_inserti128_si256(_mm256_castsi128_si256(v128), v128, 1)}; +#else + return Vec256{_mm256_broadcastsi128_si256(LoadU(Full128(), p).raw)}; +#endif +} +HWY_API Vec256 LoadDup128(Full256 /* tag */, + const float* const HWY_RESTRICT p) { +#if HWY_COMPILER_MSVC && HWY_COMPILER_MSVC < 1931 + const __m128 v128 = LoadU(Full128(), p).raw; + return Vec256{ + _mm256_insertf128_ps(_mm256_castps128_ps256(v128), v128, 1)}; +#else + return Vec256{_mm256_broadcast_ps(reinterpret_cast(p))}; +#endif +} +HWY_API Vec256 LoadDup128(Full256 /* tag */, + const double* const HWY_RESTRICT p) { +#if HWY_COMPILER_MSVC && HWY_COMPILER_MSVC < 1931 + const __m128d v128 = LoadU(Full128(), p).raw; + return Vec256{ + _mm256_insertf128_pd(_mm256_castpd128_pd256(v128), v128, 1)}; +#else + return Vec256{ + _mm256_broadcast_pd(reinterpret_cast(p))}; +#endif +} + +// ------------------------------ Store + +template +HWY_API void Store(Vec256 v, Full256 /* tag */, T* HWY_RESTRICT aligned) { + _mm256_store_si256(reinterpret_cast<__m256i*>(aligned), v.raw); +} +HWY_API void Store(const Vec256 v, Full256 /* tag */, + float* HWY_RESTRICT aligned) { + _mm256_store_ps(aligned, v.raw); +} +HWY_API void Store(const Vec256 v, Full256 /* tag */, + double* HWY_RESTRICT aligned) { + _mm256_store_pd(aligned, v.raw); +} + +template +HWY_API void StoreU(Vec256 v, Full256 /* tag */, T* HWY_RESTRICT p) { + _mm256_storeu_si256(reinterpret_cast<__m256i*>(p), v.raw); +} +HWY_API void StoreU(const Vec256 v, Full256 /* tag */, + float* HWY_RESTRICT p) { + _mm256_storeu_ps(p, v.raw); +} +HWY_API void StoreU(const Vec256 v, Full256 /* tag */, + double* HWY_RESTRICT p) { + _mm256_storeu_pd(p, v.raw); +} + +// ------------------------------ BlendedStore + +#if HWY_TARGET <= HWY_AVX3 + +template +HWY_API void BlendedStore(Vec256 v, Mask256 m, Full256 /* tag */, + T* HWY_RESTRICT p) { + _mm256_mask_storeu_epi8(p, m.raw, v.raw); +} + +template +HWY_API void BlendedStore(Vec256 v, Mask256 m, Full256 /* tag */, + T* HWY_RESTRICT p) { + _mm256_mask_storeu_epi16(p, m.raw, v.raw); +} + +template +HWY_API void BlendedStore(Vec256 v, Mask256 m, Full256 /* tag */, + T* HWY_RESTRICT p) { + _mm256_mask_storeu_epi32(p, m.raw, v.raw); +} + +template +HWY_API void BlendedStore(Vec256 v, Mask256 m, Full256 /* tag */, + T* HWY_RESTRICT p) { + _mm256_mask_storeu_epi64(p, m.raw, v.raw); +} + +HWY_API void BlendedStore(Vec256 v, Mask256 m, + Full256 /* tag */, float* HWY_RESTRICT p) { + _mm256_mask_storeu_ps(p, m.raw, v.raw); +} + +HWY_API void BlendedStore(Vec256 v, Mask256 m, + Full256 /* tag */, double* HWY_RESTRICT p) { + _mm256_mask_storeu_pd(p, m.raw, v.raw); +} + +#else // AVX2 + +// Intel SDM says "No AC# reported for any mask bit combinations". However, AMD +// allows AC# if "Alignment checking enabled and: 256-bit memory operand not +// 32-byte aligned". Fortunately AC# is not enabled by default and requires both +// OS support (CR0) and the application to set rflags.AC. We assume these remain +// disabled because x86/x64 code and compiler output often contain misaligned +// scalar accesses, which would also fault. +// +// Caveat: these are slow on AMD Jaguar/Bulldozer. 
+ +template * = nullptr> +HWY_API void BlendedStore(Vec256 v, Mask256 m, Full256 d, + T* HWY_RESTRICT p) { + // There is no maskload_epi8/16. Blending is also unsafe because loading a + // full vector that crosses the array end causes asan faults. Resort to scalar + // code; the caller should instead use memcpy, assuming m is FirstN(d, n). + const RebindToUnsigned du; + using TU = TFromD; + alignas(32) TU buf[32 / sizeof(T)]; + alignas(32) TU mask[32 / sizeof(T)]; + Store(BitCast(du, v), du, buf); + Store(BitCast(du, VecFromMask(d, m)), du, mask); + for (size_t i = 0; i < 32 / sizeof(T); ++i) { + if (mask[i]) { + CopySameSize(buf + i, p + i); + } + } +} + +template +HWY_API void BlendedStore(Vec256 v, Mask256 m, Full256 /* tag */, + T* HWY_RESTRICT p) { + auto pi = reinterpret_cast(p); // NOLINT + _mm256_maskstore_epi32(pi, m.raw, v.raw); +} + +template +HWY_API void BlendedStore(Vec256 v, Mask256 m, Full256 /* tag */, + T* HWY_RESTRICT p) { + auto pi = reinterpret_cast(p); // NOLINT + _mm256_maskstore_epi64(pi, m.raw, v.raw); +} + +HWY_API void BlendedStore(Vec256 v, Mask256 m, Full256 d, + float* HWY_RESTRICT p) { + const Vec256 mi = + BitCast(RebindToSigned(), VecFromMask(d, m)); + _mm256_maskstore_ps(p, mi.raw, v.raw); +} + +HWY_API void BlendedStore(Vec256 v, Mask256 m, + Full256 d, double* HWY_RESTRICT p) { + const Vec256 mi = + BitCast(RebindToSigned(), VecFromMask(d, m)); + _mm256_maskstore_pd(p, mi.raw, v.raw); +} + +#endif + +// ------------------------------ Non-temporal stores + +template +HWY_API void Stream(Vec256 v, Full256 /* tag */, + T* HWY_RESTRICT aligned) { + _mm256_stream_si256(reinterpret_cast<__m256i*>(aligned), v.raw); +} +HWY_API void Stream(const Vec256 v, Full256 /* tag */, + float* HWY_RESTRICT aligned) { + _mm256_stream_ps(aligned, v.raw); +} +HWY_API void Stream(const Vec256 v, Full256 /* tag */, + double* HWY_RESTRICT aligned) { + _mm256_stream_pd(aligned, v.raw); +} + +// ------------------------------ Scatter + +// Work around warnings in the intrinsic definitions (passing -1 as a mask). 
+HWY_DIAGNOSTICS(push) +HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion") + +#if HWY_TARGET <= HWY_AVX3 +namespace detail { + +template +HWY_INLINE void ScatterOffset(hwy::SizeTag<4> /* tag */, Vec256 v, + Full256 /* tag */, T* HWY_RESTRICT base, + const Vec256 offset) { + _mm256_i32scatter_epi32(base, offset.raw, v.raw, 1); +} +template +HWY_INLINE void ScatterIndex(hwy::SizeTag<4> /* tag */, Vec256 v, + Full256 /* tag */, T* HWY_RESTRICT base, + const Vec256 index) { + _mm256_i32scatter_epi32(base, index.raw, v.raw, 4); +} + +template +HWY_INLINE void ScatterOffset(hwy::SizeTag<8> /* tag */, Vec256 v, + Full256 /* tag */, T* HWY_RESTRICT base, + const Vec256 offset) { + _mm256_i64scatter_epi64(base, offset.raw, v.raw, 1); +} +template +HWY_INLINE void ScatterIndex(hwy::SizeTag<8> /* tag */, Vec256 v, + Full256 /* tag */, T* HWY_RESTRICT base, + const Vec256 index) { + _mm256_i64scatter_epi64(base, index.raw, v.raw, 8); +} + +} // namespace detail + +template +HWY_API void ScatterOffset(Vec256 v, Full256 d, T* HWY_RESTRICT base, + const Vec256 offset) { + static_assert(sizeof(T) == sizeof(Offset), "Must match for portability"); + return detail::ScatterOffset(hwy::SizeTag(), v, d, base, offset); +} +template +HWY_API void ScatterIndex(Vec256 v, Full256 d, T* HWY_RESTRICT base, + const Vec256 index) { + static_assert(sizeof(T) == sizeof(Index), "Must match for portability"); + return detail::ScatterIndex(hwy::SizeTag(), v, d, base, index); +} + +HWY_API void ScatterOffset(Vec256 v, Full256 /* tag */, + float* HWY_RESTRICT base, + const Vec256 offset) { + _mm256_i32scatter_ps(base, offset.raw, v.raw, 1); +} +HWY_API void ScatterIndex(Vec256 v, Full256 /* tag */, + float* HWY_RESTRICT base, + const Vec256 index) { + _mm256_i32scatter_ps(base, index.raw, v.raw, 4); +} + +HWY_API void ScatterOffset(Vec256 v, Full256 /* tag */, + double* HWY_RESTRICT base, + const Vec256 offset) { + _mm256_i64scatter_pd(base, offset.raw, v.raw, 1); +} +HWY_API void ScatterIndex(Vec256 v, Full256 /* tag */, + double* HWY_RESTRICT base, + const Vec256 index) { + _mm256_i64scatter_pd(base, index.raw, v.raw, 8); +} + +#else + +template +HWY_API void ScatterOffset(Vec256 v, Full256 d, T* HWY_RESTRICT base, + const Vec256 offset) { + static_assert(sizeof(T) == sizeof(Offset), "Must match for portability"); + + constexpr size_t N = 32 / sizeof(T); + alignas(32) T lanes[N]; + Store(v, d, lanes); + + alignas(32) Offset offset_lanes[N]; + Store(offset, Full256(), offset_lanes); + + uint8_t* base_bytes = reinterpret_cast(base); + for (size_t i = 0; i < N; ++i) { + CopyBytes(&lanes[i], base_bytes + offset_lanes[i]); + } +} + +template +HWY_API void ScatterIndex(Vec256 v, Full256 d, T* HWY_RESTRICT base, + const Vec256 index) { + static_assert(sizeof(T) == sizeof(Index), "Must match for portability"); + + constexpr size_t N = 32 / sizeof(T); + alignas(32) T lanes[N]; + Store(v, d, lanes); + + alignas(32) Index index_lanes[N]; + Store(index, Full256(), index_lanes); + + for (size_t i = 0; i < N; ++i) { + base[index_lanes[i]] = lanes[i]; + } +} + +#endif + +// ------------------------------ Gather + +namespace detail { + +template +HWY_INLINE Vec256 GatherOffset(hwy::SizeTag<4> /* tag */, + Full256 /* tag */, + const T* HWY_RESTRICT base, + const Vec256 offset) { + return Vec256{_mm256_i32gather_epi32( + reinterpret_cast(base), offset.raw, 1)}; +} +template +HWY_INLINE Vec256 GatherIndex(hwy::SizeTag<4> /* tag */, + Full256 /* tag */, + const T* HWY_RESTRICT base, + const Vec256 index) { + return 
Vec256{_mm256_i32gather_epi32( + reinterpret_cast(base), index.raw, 4)}; +} + +template +HWY_INLINE Vec256 GatherOffset(hwy::SizeTag<8> /* tag */, + Full256 /* tag */, + const T* HWY_RESTRICT base, + const Vec256 offset) { + return Vec256{_mm256_i64gather_epi64( + reinterpret_cast(base), offset.raw, 1)}; +} +template +HWY_INLINE Vec256 GatherIndex(hwy::SizeTag<8> /* tag */, + Full256 /* tag */, + const T* HWY_RESTRICT base, + const Vec256 index) { + return Vec256{_mm256_i64gather_epi64( + reinterpret_cast(base), index.raw, 8)}; +} + +} // namespace detail + +template +HWY_API Vec256 GatherOffset(Full256 d, const T* HWY_RESTRICT base, + const Vec256 offset) { + static_assert(sizeof(T) == sizeof(Offset), "Must match for portability"); + return detail::GatherOffset(hwy::SizeTag(), d, base, offset); +} +template +HWY_API Vec256 GatherIndex(Full256 d, const T* HWY_RESTRICT base, + const Vec256 index) { + static_assert(sizeof(T) == sizeof(Index), "Must match for portability"); + return detail::GatherIndex(hwy::SizeTag(), d, base, index); +} + +HWY_API Vec256 GatherOffset(Full256 /* tag */, + const float* HWY_RESTRICT base, + const Vec256 offset) { + return Vec256{_mm256_i32gather_ps(base, offset.raw, 1)}; +} +HWY_API Vec256 GatherIndex(Full256 /* tag */, + const float* HWY_RESTRICT base, + const Vec256 index) { + return Vec256{_mm256_i32gather_ps(base, index.raw, 4)}; +} + +HWY_API Vec256 GatherOffset(Full256 /* tag */, + const double* HWY_RESTRICT base, + const Vec256 offset) { + return Vec256{_mm256_i64gather_pd(base, offset.raw, 1)}; +} +HWY_API Vec256 GatherIndex(Full256 /* tag */, + const double* HWY_RESTRICT base, + const Vec256 index) { + return Vec256{_mm256_i64gather_pd(base, index.raw, 8)}; +} + +HWY_DIAGNOSTICS(pop) + +// ================================================== SWIZZLE + +// ------------------------------ LowerHalf + +template +HWY_API Vec128 LowerHalf(Full128 /* tag */, Vec256 v) { + return Vec128{_mm256_castsi256_si128(v.raw)}; +} +HWY_API Vec128 LowerHalf(Full128 /* tag */, Vec256 v) { + return Vec128{_mm256_castps256_ps128(v.raw)}; +} +HWY_API Vec128 LowerHalf(Full128 /* tag */, Vec256 v) { + return Vec128{_mm256_castpd256_pd128(v.raw)}; +} + +template +HWY_API Vec128 LowerHalf(Vec256 v) { + return LowerHalf(Full128(), v); +} + +// ------------------------------ UpperHalf + +template +HWY_API Vec128 UpperHalf(Full128 /* tag */, Vec256 v) { + return Vec128{_mm256_extracti128_si256(v.raw, 1)}; +} +HWY_API Vec128 UpperHalf(Full128 /* tag */, Vec256 v) { + return Vec128{_mm256_extractf128_ps(v.raw, 1)}; +} +HWY_API Vec128 UpperHalf(Full128 /* tag */, Vec256 v) { + return Vec128{_mm256_extractf128_pd(v.raw, 1)}; +} + +// ------------------------------ ExtractLane (Store) +template +HWY_API T ExtractLane(const Vec256 v, size_t i) { + const Full256 d; + HWY_DASSERT(i < Lanes(d)); + alignas(32) T lanes[32 / sizeof(T)]; + Store(v, d, lanes); + return lanes[i]; +} + +// ------------------------------ InsertLane (Store) +template +HWY_API Vec256 InsertLane(const Vec256 v, size_t i, T t) { + const Full256 d; + HWY_DASSERT(i < Lanes(d)); + alignas(64) T lanes[64 / sizeof(T)]; + Store(v, d, lanes); + lanes[i] = t; + return Load(d, lanes); +} + +// ------------------------------ GetLane (LowerHalf) +template +HWY_API T GetLane(const Vec256 v) { + return GetLane(LowerHalf(v)); +} + +// ------------------------------ ZeroExtendVector + +// Unfortunately the initial _mm256_castsi128_si256 intrinsic leaves the upper +// bits undefined. 
Although it makes sense for them to be zero (VEX encoded +// 128-bit instructions zero the upper lanes to avoid large penalties), a +// compiler could decide to optimize out code that relies on this. +// +// The newer _mm256_zextsi128_si256 intrinsic fixes this by specifying the +// zeroing, but it is not available on MSVC until 15.7 nor GCC until 10.1. For +// older GCC, we can still obtain the desired code thanks to pattern +// recognition; note that the expensive insert instruction is not actually +// generated, see https://gcc.godbolt.org/z/1MKGaP. + +#if !defined(HWY_HAVE_ZEXT) +#if (HWY_COMPILER_MSVC && HWY_COMPILER_MSVC >= 1915) || \ + (HWY_COMPILER_CLANG && HWY_COMPILER_CLANG >= 500) || \ + (HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL >= 1000) +#define HWY_HAVE_ZEXT 1 +#else +#define HWY_HAVE_ZEXT 0 +#endif +#endif // defined(HWY_HAVE_ZEXT) + +template +HWY_API Vec256 ZeroExtendVector(Full256 /* tag */, Vec128 lo) { +#if HWY_HAVE_ZEXT +return Vec256{_mm256_zextsi128_si256(lo.raw)}; +#else + return Vec256{_mm256_inserti128_si256(_mm256_setzero_si256(), lo.raw, 0)}; +#endif +} +HWY_API Vec256 ZeroExtendVector(Full256 /* tag */, + Vec128 lo) { +#if HWY_HAVE_ZEXT + return Vec256{_mm256_zextps128_ps256(lo.raw)}; +#else + return Vec256{_mm256_insertf128_ps(_mm256_setzero_ps(), lo.raw, 0)}; +#endif +} +HWY_API Vec256 ZeroExtendVector(Full256 /* tag */, + Vec128 lo) { +#if HWY_HAVE_ZEXT + return Vec256{_mm256_zextpd128_pd256(lo.raw)}; +#else + return Vec256{_mm256_insertf128_pd(_mm256_setzero_pd(), lo.raw, 0)}; +#endif +} + +// ------------------------------ Combine + +template +HWY_API Vec256 Combine(Full256 d, Vec128 hi, Vec128 lo) { + const auto lo256 = ZeroExtendVector(d, lo); + return Vec256{_mm256_inserti128_si256(lo256.raw, hi.raw, 1)}; +} +HWY_API Vec256 Combine(Full256 d, Vec128 hi, + Vec128 lo) { + const auto lo256 = ZeroExtendVector(d, lo); + return Vec256{_mm256_insertf128_ps(lo256.raw, hi.raw, 1)}; +} +HWY_API Vec256 Combine(Full256 d, Vec128 hi, + Vec128 lo) { + const auto lo256 = ZeroExtendVector(d, lo); + return Vec256{_mm256_insertf128_pd(lo256.raw, hi.raw, 1)}; +} + +// ------------------------------ ShiftLeftBytes + +template +HWY_API Vec256 ShiftLeftBytes(Full256 /* tag */, const Vec256 v) { + static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes"); + // This is the same operation as _mm256_bslli_epi128. + return Vec256{_mm256_slli_si256(v.raw, kBytes)}; +} + +template +HWY_API Vec256 ShiftLeftBytes(const Vec256 v) { + return ShiftLeftBytes(Full256(), v); +} + +// ------------------------------ ShiftLeftLanes + +template +HWY_API Vec256 ShiftLeftLanes(Full256 d, const Vec256 v) { + const Repartition d8; + return BitCast(d, ShiftLeftBytes(BitCast(d8, v))); +} + +template +HWY_API Vec256 ShiftLeftLanes(const Vec256 v) { + return ShiftLeftLanes(Full256(), v); +} + +// ------------------------------ ShiftRightBytes + +template +HWY_API Vec256 ShiftRightBytes(Full256 /* tag */, const Vec256 v) { + static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes"); + // This is the same operation as _mm256_bsrli_epi128. + return Vec256{_mm256_srli_si256(v.raw, kBytes)}; +} + +// ------------------------------ ShiftRightLanes +template +HWY_API Vec256 ShiftRightLanes(Full256 d, const Vec256 v) { + const Repartition d8; + return BitCast(d, ShiftRightBytes(d8, BitCast(d8, v))); +} + +// ------------------------------ CombineShiftRightBytes + +// Extracts 128 bits from by skipping the least-significant kBytes. 
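+// Editorial note (not upstream code): per 128-bit block, the result is bytes
+// [kBytes, 16) of lo followed by bytes [0, kBytes) of hi. Sketch with
+// hypothetical u8 data and kBytes = 4:
+//   const Full256<uint8_t> d;
+//   const auto r = CombineShiftRightBytes<4>(d, hi, lo);
+//   // each block of r = {hi[3..0], lo[15..4]} (most-significant first)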
+template > +HWY_API V CombineShiftRightBytes(Full256 d, V hi, V lo) { + const Repartition d8; + return BitCast(d, Vec256{_mm256_alignr_epi8( + BitCast(d8, hi).raw, BitCast(d8, lo).raw, kBytes)}); +} + +// ------------------------------ Broadcast/splat any lane + +// Unsigned +template +HWY_API Vec256 Broadcast(const Vec256 v) { + static_assert(0 <= kLane && kLane < 8, "Invalid lane"); + if (kLane < 4) { + const __m256i lo = _mm256_shufflelo_epi16(v.raw, (0x55 * kLane) & 0xFF); + return Vec256{_mm256_unpacklo_epi64(lo, lo)}; + } else { + const __m256i hi = + _mm256_shufflehi_epi16(v.raw, (0x55 * (kLane - 4)) & 0xFF); + return Vec256{_mm256_unpackhi_epi64(hi, hi)}; + } +} +template +HWY_API Vec256 Broadcast(const Vec256 v) { + static_assert(0 <= kLane && kLane < 4, "Invalid lane"); + return Vec256{_mm256_shuffle_epi32(v.raw, 0x55 * kLane)}; +} +template +HWY_API Vec256 Broadcast(const Vec256 v) { + static_assert(0 <= kLane && kLane < 2, "Invalid lane"); + return Vec256{_mm256_shuffle_epi32(v.raw, kLane ? 0xEE : 0x44)}; +} + +// Signed +template +HWY_API Vec256 Broadcast(const Vec256 v) { + static_assert(0 <= kLane && kLane < 8, "Invalid lane"); + if (kLane < 4) { + const __m256i lo = _mm256_shufflelo_epi16(v.raw, (0x55 * kLane) & 0xFF); + return Vec256{_mm256_unpacklo_epi64(lo, lo)}; + } else { + const __m256i hi = + _mm256_shufflehi_epi16(v.raw, (0x55 * (kLane - 4)) & 0xFF); + return Vec256{_mm256_unpackhi_epi64(hi, hi)}; + } +} +template +HWY_API Vec256 Broadcast(const Vec256 v) { + static_assert(0 <= kLane && kLane < 4, "Invalid lane"); + return Vec256{_mm256_shuffle_epi32(v.raw, 0x55 * kLane)}; +} +template +HWY_API Vec256 Broadcast(const Vec256 v) { + static_assert(0 <= kLane && kLane < 2, "Invalid lane"); + return Vec256{_mm256_shuffle_epi32(v.raw, kLane ? 0xEE : 0x44)}; +} + +// Float +template +HWY_API Vec256 Broadcast(Vec256 v) { + static_assert(0 <= kLane && kLane < 4, "Invalid lane"); + return Vec256{_mm256_shuffle_ps(v.raw, v.raw, 0x55 * kLane)}; +} +template +HWY_API Vec256 Broadcast(const Vec256 v) { + static_assert(0 <= kLane && kLane < 2, "Invalid lane"); + return Vec256{_mm256_shuffle_pd(v.raw, v.raw, 15 * kLane)}; +} + +// ------------------------------ Hard-coded shuffles + +// Notation: let Vec256 have lanes 7,6,5,4,3,2,1,0 (0 is +// least-significant). Shuffle0321 rotates four-lane blocks one lane to the +// right (the previous least-significant lane is now most-significant => +// 47650321). These could also be implemented via CombineShiftRightBytes but +// the shuffle_abcd notation is more convenient. + +// Swap 32-bit halves in 64-bit halves. 
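+// Editorial sketch (not upstream code), in the 7..0 lane notation above:
+//   const Full256<uint32_t> d;
+//   const auto r = Shuffle2301(Iota(d, 0));  // lanes 6,7,4,5,2,3,0,1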
+template +HWY_API Vec256 Shuffle2301(const Vec256 v) { + return Vec256{_mm256_shuffle_epi32(v.raw, 0xB1)}; +} +HWY_API Vec256 Shuffle2301(const Vec256 v) { + return Vec256{_mm256_shuffle_ps(v.raw, v.raw, 0xB1)}; +} + +namespace detail { + +template +HWY_API Vec256 Shuffle2301(const Vec256 a, const Vec256 b) { + const Full256 d; + const RebindToFloat df; + constexpr int m = _MM_SHUFFLE(2, 3, 0, 1); + return BitCast(d, Vec256{_mm256_shuffle_ps(BitCast(df, a).raw, + BitCast(df, b).raw, m)}); +} +template +HWY_API Vec256 Shuffle1230(const Vec256 a, const Vec256 b) { + const Full256 d; + const RebindToFloat df; + constexpr int m = _MM_SHUFFLE(1, 2, 3, 0); + return BitCast(d, Vec256{_mm256_shuffle_ps(BitCast(df, a).raw, + BitCast(df, b).raw, m)}); +} +template +HWY_API Vec256 Shuffle3012(const Vec256 a, const Vec256 b) { + const Full256 d; + const RebindToFloat df; + constexpr int m = _MM_SHUFFLE(3, 0, 1, 2); + return BitCast(d, Vec256{_mm256_shuffle_ps(BitCast(df, a).raw, + BitCast(df, b).raw, m)}); +} + +} // namespace detail + +// Swap 64-bit halves +HWY_API Vec256 Shuffle1032(const Vec256 v) { + return Vec256{_mm256_shuffle_epi32(v.raw, 0x4E)}; +} +HWY_API Vec256 Shuffle1032(const Vec256 v) { + return Vec256{_mm256_shuffle_epi32(v.raw, 0x4E)}; +} +HWY_API Vec256 Shuffle1032(const Vec256 v) { + // Shorter encoding than _mm256_permute_ps. + return Vec256{_mm256_shuffle_ps(v.raw, v.raw, 0x4E)}; +} +HWY_API Vec256 Shuffle01(const Vec256 v) { + return Vec256{_mm256_shuffle_epi32(v.raw, 0x4E)}; +} +HWY_API Vec256 Shuffle01(const Vec256 v) { + return Vec256{_mm256_shuffle_epi32(v.raw, 0x4E)}; +} +HWY_API Vec256 Shuffle01(const Vec256 v) { + // Shorter encoding than _mm256_permute_pd. + return Vec256{_mm256_shuffle_pd(v.raw, v.raw, 5)}; +} + +// Rotate right 32 bits +HWY_API Vec256 Shuffle0321(const Vec256 v) { + return Vec256{_mm256_shuffle_epi32(v.raw, 0x39)}; +} +HWY_API Vec256 Shuffle0321(const Vec256 v) { + return Vec256{_mm256_shuffle_epi32(v.raw, 0x39)}; +} +HWY_API Vec256 Shuffle0321(const Vec256 v) { + return Vec256{_mm256_shuffle_ps(v.raw, v.raw, 0x39)}; +} +// Rotate left 32 bits +HWY_API Vec256 Shuffle2103(const Vec256 v) { + return Vec256{_mm256_shuffle_epi32(v.raw, 0x93)}; +} +HWY_API Vec256 Shuffle2103(const Vec256 v) { + return Vec256{_mm256_shuffle_epi32(v.raw, 0x93)}; +} +HWY_API Vec256 Shuffle2103(const Vec256 v) { + return Vec256{_mm256_shuffle_ps(v.raw, v.raw, 0x93)}; +} + +// Reverse +HWY_API Vec256 Shuffle0123(const Vec256 v) { + return Vec256{_mm256_shuffle_epi32(v.raw, 0x1B)}; +} +HWY_API Vec256 Shuffle0123(const Vec256 v) { + return Vec256{_mm256_shuffle_epi32(v.raw, 0x1B)}; +} +HWY_API Vec256 Shuffle0123(const Vec256 v) { + return Vec256{_mm256_shuffle_ps(v.raw, v.raw, 0x1B)}; +} + +// ------------------------------ TableLookupLanes + +// Returned by SetTableIndices/IndicesFromVec for use by TableLookupLanes. 
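+// Editorial sketch (not upstream code): a cross-block permutation with
+// hypothetical indices (each must be in [0, Lanes(d))):
+//   const Full256<uint32_t> d;
+//   alignas(32) constexpr int32_t kIdx[8] = {7, 0, 6, 1, 5, 2, 4, 3};
+//   const auto r = TableLookupLanes(v, SetTableIndices(d, kIdx));
+//   // r[i] = v[kIdx[i]] for each of the 8 lanes.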
+template +struct Indices256 { + __m256i raw; +}; + +// Native 8x32 instruction: indices remain unchanged +template +HWY_API Indices256 IndicesFromVec(Full256 /* tag */, Vec256 vec) { + static_assert(sizeof(T) == sizeof(TI), "Index size must match lane"); +#if HWY_IS_DEBUG_BUILD + const Full256 di; + HWY_DASSERT(AllFalse(di, Lt(vec, Zero(di))) && + AllTrue(di, Lt(vec, Set(di, static_cast(32 / sizeof(T)))))); +#endif + return Indices256{vec.raw}; +} + +// 64-bit lanes: convert indices to 8x32 unless AVX3 is available +template +HWY_API Indices256 IndicesFromVec(Full256 d, Vec256 idx64) { + static_assert(sizeof(T) == sizeof(TI), "Index size must match lane"); + const Rebind di; + (void)di; // potentially unused +#if HWY_IS_DEBUG_BUILD + HWY_DASSERT(AllFalse(di, Lt(idx64, Zero(di))) && + AllTrue(di, Lt(idx64, Set(di, static_cast(32 / sizeof(T)))))); +#endif + +#if HWY_TARGET <= HWY_AVX3 + (void)d; + return Indices256{idx64.raw}; +#else + const Repartition df; // 32-bit! + // Replicate 64-bit index into upper 32 bits + const Vec256 dup = + BitCast(di, Vec256{_mm256_moveldup_ps(BitCast(df, idx64).raw)}); + // For each idx64 i, idx32 are 2*i and 2*i+1. + const Vec256 idx32 = dup + dup + Set(di, TI(1) << 32); + return Indices256{idx32.raw}; +#endif +} + +template +HWY_API Indices256 SetTableIndices(const Full256 d, const TI* idx) { + const Rebind di; + return IndicesFromVec(d, LoadU(di, idx)); +} + +template +HWY_API Vec256 TableLookupLanes(Vec256 v, Indices256 idx) { + return Vec256{_mm256_permutevar8x32_epi32(v.raw, idx.raw)}; +} + +template +HWY_API Vec256 TableLookupLanes(Vec256 v, Indices256 idx) { +#if HWY_TARGET <= HWY_AVX3 + return Vec256{_mm256_permutexvar_epi64(idx.raw, v.raw)}; +#else + return Vec256{_mm256_permutevar8x32_epi32(v.raw, idx.raw)}; +#endif +} + +HWY_API Vec256 TableLookupLanes(const Vec256 v, + const Indices256 idx) { + return Vec256{_mm256_permutevar8x32_ps(v.raw, idx.raw)}; +} + +HWY_API Vec256 TableLookupLanes(const Vec256 v, + const Indices256 idx) { +#if HWY_TARGET <= HWY_AVX3 + return Vec256{_mm256_permutexvar_pd(idx.raw, v.raw)}; +#else + const Full256 df; + const Full256 du; + return BitCast(df, Vec256{_mm256_permutevar8x32_epi32( + BitCast(du, v).raw, idx.raw)}); +#endif +} + +// ------------------------------ SwapAdjacentBlocks + +template +HWY_API Vec256 SwapAdjacentBlocks(Vec256 v) { + return Vec256{_mm256_permute2x128_si256(v.raw, v.raw, 0x01)}; +} + +HWY_API Vec256 SwapAdjacentBlocks(Vec256 v) { + return Vec256{_mm256_permute2f128_ps(v.raw, v.raw, 0x01)}; +} + +HWY_API Vec256 SwapAdjacentBlocks(Vec256 v) { + return Vec256{_mm256_permute2f128_pd(v.raw, v.raw, 0x01)}; +} + +// ------------------------------ Reverse (RotateRight) + +template +HWY_API Vec256 Reverse(Full256 d, const Vec256 v) { + alignas(32) constexpr int32_t kReverse[8] = {7, 6, 5, 4, 3, 2, 1, 0}; + return TableLookupLanes(v, SetTableIndices(d, kReverse)); +} + +template +HWY_API Vec256 Reverse(Full256 d, const Vec256 v) { + alignas(32) constexpr int64_t kReverse[4] = {3, 2, 1, 0}; + return TableLookupLanes(v, SetTableIndices(d, kReverse)); +} + +template +HWY_API Vec256 Reverse(Full256 d, const Vec256 v) { +#if HWY_TARGET <= HWY_AVX3 + const RebindToSigned di; + alignas(32) constexpr int16_t kReverse[16] = {15, 14, 13, 12, 11, 10, 9, 8, + 7, 6, 5, 4, 3, 2, 1, 0}; + const Vec256 idx = Load(di, kReverse); + return BitCast(d, Vec256{ + _mm256_permutexvar_epi16(idx.raw, BitCast(di, v).raw)}); +#else + const RepartitionToWide> du32; + const Vec256 rev32 = Reverse(du32, BitCast(du32, v)); + return 
BitCast(d, RotateRight<16>(rev32)); +#endif +} + +// ------------------------------ Reverse2 + +template +HWY_API Vec256 Reverse2(Full256 d, const Vec256 v) { + const Full256 du32; + return BitCast(d, RotateRight<16>(BitCast(du32, v))); +} + +template +HWY_API Vec256 Reverse2(Full256 /* tag */, const Vec256 v) { + return Shuffle2301(v); +} + +template +HWY_API Vec256 Reverse2(Full256 /* tag */, const Vec256 v) { + return Shuffle01(v); +} + +// ------------------------------ Reverse4 (SwapAdjacentBlocks) + +template +HWY_API Vec256 Reverse4(Full256 d, const Vec256 v) { +#if HWY_TARGET <= HWY_AVX3 + const RebindToSigned di; + alignas(32) constexpr int16_t kReverse4[16] = {3, 2, 1, 0, 7, 6, 5, 4, + 11, 10, 9, 8, 15, 14, 13, 12}; + const Vec256 idx = Load(di, kReverse4); + return BitCast(d, Vec256{ + _mm256_permutexvar_epi16(idx.raw, BitCast(di, v).raw)}); +#else + const RepartitionToWide dw; + return Reverse2(d, BitCast(d, Shuffle2301(BitCast(dw, v)))); +#endif +} + +template +HWY_API Vec256 Reverse4(Full256 /* tag */, const Vec256 v) { + return Shuffle0123(v); +} + +template +HWY_API Vec256 Reverse4(Full256 /* tag */, const Vec256 v) { + // Could also use _mm256_permute4x64_epi64. + return SwapAdjacentBlocks(Shuffle01(v)); +} + +// ------------------------------ Reverse8 + +template +HWY_API Vec256 Reverse8(Full256 d, const Vec256 v) { +#if HWY_TARGET <= HWY_AVX3 + const RebindToSigned di; + alignas(32) constexpr int16_t kReverse8[16] = {7, 6, 5, 4, 3, 2, 1, 0, + 15, 14, 13, 12, 11, 10, 9, 8}; + const Vec256 idx = Load(di, kReverse8); + return BitCast(d, Vec256{ + _mm256_permutexvar_epi16(idx.raw, BitCast(di, v).raw)}); +#else + const RepartitionToWide dw; + return Reverse2(d, BitCast(d, Shuffle0123(BitCast(dw, v)))); +#endif +} + +template +HWY_API Vec256 Reverse8(Full256 d, const Vec256 v) { + return Reverse(d, v); +} + +template +HWY_API Vec256 Reverse8(Full256 /* tag */, const Vec256 /* v */) { + HWY_ASSERT(0); // AVX2 does not have 8 64-bit lanes +} + +// ------------------------------ InterleaveLower + +// Interleaves lanes from halves of the 128-bit blocks of "a" (which provides +// the least-significant lane) and "b". To concatenate two half-width integers +// into one, use ZipLower/Upper instead (also works with scalar). 
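+// Editorial note (not upstream code): the interleave is per 128-bit block.
+// With hypothetical u32 lanes a = {7,...,0} and b = {17,...,10}:
+//   InterleaveLower(a, b) = {15,5,14,4, 11,1,10,0}
+// i.e. lanes 0..1 of each block, not lanes 0..3 of the whole vector.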
+ +HWY_API Vec256 InterleaveLower(const Vec256 a, + const Vec256 b) { + return Vec256{_mm256_unpacklo_epi8(a.raw, b.raw)}; +} +HWY_API Vec256 InterleaveLower(const Vec256 a, + const Vec256 b) { + return Vec256{_mm256_unpacklo_epi16(a.raw, b.raw)}; +} +HWY_API Vec256 InterleaveLower(const Vec256 a, + const Vec256 b) { + return Vec256{_mm256_unpacklo_epi32(a.raw, b.raw)}; +} +HWY_API Vec256 InterleaveLower(const Vec256 a, + const Vec256 b) { + return Vec256{_mm256_unpacklo_epi64(a.raw, b.raw)}; +} + +HWY_API Vec256 InterleaveLower(const Vec256 a, + const Vec256 b) { + return Vec256{_mm256_unpacklo_epi8(a.raw, b.raw)}; +} +HWY_API Vec256 InterleaveLower(const Vec256 a, + const Vec256 b) { + return Vec256{_mm256_unpacklo_epi16(a.raw, b.raw)}; +} +HWY_API Vec256 InterleaveLower(const Vec256 a, + const Vec256 b) { + return Vec256{_mm256_unpacklo_epi32(a.raw, b.raw)}; +} +HWY_API Vec256 InterleaveLower(const Vec256 a, + const Vec256 b) { + return Vec256{_mm256_unpacklo_epi64(a.raw, b.raw)}; +} + +HWY_API Vec256 InterleaveLower(const Vec256 a, + const Vec256 b) { + return Vec256{_mm256_unpacklo_ps(a.raw, b.raw)}; +} +HWY_API Vec256 InterleaveLower(const Vec256 a, + const Vec256 b) { + return Vec256{_mm256_unpacklo_pd(a.raw, b.raw)}; +} + +// ------------------------------ InterleaveUpper + +// All functions inside detail lack the required D parameter. +namespace detail { + +HWY_API Vec256 InterleaveUpper(const Vec256 a, + const Vec256 b) { + return Vec256{_mm256_unpackhi_epi8(a.raw, b.raw)}; +} +HWY_API Vec256 InterleaveUpper(const Vec256 a, + const Vec256 b) { + return Vec256{_mm256_unpackhi_epi16(a.raw, b.raw)}; +} +HWY_API Vec256 InterleaveUpper(const Vec256 a, + const Vec256 b) { + return Vec256{_mm256_unpackhi_epi32(a.raw, b.raw)}; +} +HWY_API Vec256 InterleaveUpper(const Vec256 a, + const Vec256 b) { + return Vec256{_mm256_unpackhi_epi64(a.raw, b.raw)}; +} + +HWY_API Vec256 InterleaveUpper(const Vec256 a, + const Vec256 b) { + return Vec256{_mm256_unpackhi_epi8(a.raw, b.raw)}; +} +HWY_API Vec256 InterleaveUpper(const Vec256 a, + const Vec256 b) { + return Vec256{_mm256_unpackhi_epi16(a.raw, b.raw)}; +} +HWY_API Vec256 InterleaveUpper(const Vec256 a, + const Vec256 b) { + return Vec256{_mm256_unpackhi_epi32(a.raw, b.raw)}; +} +HWY_API Vec256 InterleaveUpper(const Vec256 a, + const Vec256 b) { + return Vec256{_mm256_unpackhi_epi64(a.raw, b.raw)}; +} + +HWY_API Vec256 InterleaveUpper(const Vec256 a, + const Vec256 b) { + return Vec256{_mm256_unpackhi_ps(a.raw, b.raw)}; +} +HWY_API Vec256 InterleaveUpper(const Vec256 a, + const Vec256 b) { + return Vec256{_mm256_unpackhi_pd(a.raw, b.raw)}; +} + +} // namespace detail + +template > +HWY_API V InterleaveUpper(Full256 /* tag */, V a, V b) { + return detail::InterleaveUpper(a, b); +} + +// ------------------------------ ZipLower/ZipUpper (InterleaveLower) + +// Same as Interleave*, except that the return lanes are double-width integers; +// this is necessary because the single-lane scalar cannot return two values. +template > +HWY_API Vec256 ZipLower(Vec256 a, Vec256 b) { + return BitCast(Full256(), InterleaveLower(a, b)); +} +template > +HWY_API Vec256 ZipLower(Full256 dw, Vec256 a, Vec256 b) { + return BitCast(dw, InterleaveLower(a, b)); +} + +template > +HWY_API Vec256 ZipUpper(Full256 dw, Vec256 a, Vec256 b) { + return BitCast(dw, InterleaveUpper(Full256(), a, b)); +} + +// ------------------------------ Blocks (LowerHalf, ZeroExtendVector) + +// _mm256_broadcastsi128_si256 has 7 cycle latency on ICL. 
+// _mm256_permute2x128_si256 is slow on Zen1 (8 uops), so we avoid it (at no +// extra cost) for LowerLower and UpperLower. + +// hiH,hiL loH,loL |-> hiL,loL (= lower halves) +template +HWY_API Vec256 ConcatLowerLower(Full256 d, const Vec256 hi, + const Vec256 lo) { + const Half d2; + return Vec256{_mm256_inserti128_si256(lo.raw, LowerHalf(d2, hi).raw, 1)}; +} +HWY_API Vec256 ConcatLowerLower(Full256 d, const Vec256 hi, + const Vec256 lo) { + const Half d2; + return Vec256{_mm256_insertf128_ps(lo.raw, LowerHalf(d2, hi).raw, 1)}; +} +HWY_API Vec256 ConcatLowerLower(Full256 d, + const Vec256 hi, + const Vec256 lo) { + const Half d2; + return Vec256{_mm256_insertf128_pd(lo.raw, LowerHalf(d2, hi).raw, 1)}; +} + +// hiH,hiL loH,loL |-> hiL,loH (= inner halves / swap blocks) +template +HWY_API Vec256 ConcatLowerUpper(Full256 /* tag */, const Vec256 hi, + const Vec256 lo) { + return Vec256{_mm256_permute2x128_si256(lo.raw, hi.raw, 0x21)}; +} +HWY_API Vec256 ConcatLowerUpper(Full256 /* tag */, + const Vec256 hi, + const Vec256 lo) { + return Vec256{_mm256_permute2f128_ps(lo.raw, hi.raw, 0x21)}; +} +HWY_API Vec256 ConcatLowerUpper(Full256 /* tag */, + const Vec256 hi, + const Vec256 lo) { + return Vec256{_mm256_permute2f128_pd(lo.raw, hi.raw, 0x21)}; +} + +// hiH,hiL loH,loL |-> hiH,loL (= outer halves) +template +HWY_API Vec256 ConcatUpperLower(Full256 /* tag */, const Vec256 hi, + const Vec256 lo) { + return Vec256{_mm256_blend_epi32(hi.raw, lo.raw, 0x0F)}; +} +HWY_API Vec256 ConcatUpperLower(Full256 /* tag */, + const Vec256 hi, + const Vec256 lo) { + return Vec256{_mm256_blend_ps(hi.raw, lo.raw, 0x0F)}; +} +HWY_API Vec256 ConcatUpperLower(Full256 /* tag */, + const Vec256 hi, + const Vec256 lo) { + return Vec256{_mm256_blend_pd(hi.raw, lo.raw, 3)}; +} + +// hiH,hiL loH,loL |-> hiH,loH (= upper halves) +template +HWY_API Vec256 ConcatUpperUpper(Full256 /* tag */, const Vec256 hi, + const Vec256 lo) { + return Vec256{_mm256_permute2x128_si256(lo.raw, hi.raw, 0x31)}; +} +HWY_API Vec256 ConcatUpperUpper(Full256 /* tag */, + const Vec256 hi, + const Vec256 lo) { + return Vec256{_mm256_permute2f128_ps(lo.raw, hi.raw, 0x31)}; +} +HWY_API Vec256 ConcatUpperUpper(Full256 /* tag */, + const Vec256 hi, + const Vec256 lo) { + return Vec256{_mm256_permute2f128_pd(lo.raw, hi.raw, 0x31)}; +} + +// ------------------------------ ConcatOdd + +template +HWY_API Vec256 ConcatOdd(Full256 d, Vec256 hi, Vec256 lo) { + const RebindToUnsigned du; +#if HWY_TARGET == HWY_AVX3_DL + alignas(32) constexpr uint8_t kIdx[32] = { + 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, + 33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63}; + return BitCast(d, Vec256{_mm256_mask2_permutex2var_epi8( + BitCast(du, lo).raw, Load(du, kIdx).raw, + __mmask32{0xFFFFFFFFu}, BitCast(du, hi).raw)}); +#else + const RepartitionToWide dw; + // Unsigned 8-bit shift so we can pack. 
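+  // Editorial note (not upstream code): each u16 lane holds bytes
+  // {odd, even}; ShiftRight<8> leaves {0, odd}, and _mm256_packus_epi16
+  // saturates u16 to u8, which here just keeps that odd byte. The final
+  // permute4x64 undoes the per-block interleaving of packus.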
+ const Vec256 uH = ShiftRight<8>(BitCast(dw, hi)); + const Vec256 uL = ShiftRight<8>(BitCast(dw, lo)); + const __m256i u8 = _mm256_packus_epi16(uL.raw, uH.raw); + return Vec256{_mm256_permute4x64_epi64(u8, _MM_SHUFFLE(3, 1, 2, 0))}; +#endif +} + +template +HWY_API Vec256 ConcatOdd(Full256 d, Vec256 hi, Vec256 lo) { + const RebindToUnsigned du; +#if HWY_TARGET <= HWY_AVX3 + alignas(32) constexpr uint16_t kIdx[16] = {1, 3, 5, 7, 9, 11, 13, 15, + 17, 19, 21, 23, 25, 27, 29, 31}; + return BitCast(d, Vec256{_mm256_mask2_permutex2var_epi16( + BitCast(du, lo).raw, Load(du, kIdx).raw, + __mmask16{0xFFFF}, BitCast(du, hi).raw)}); +#else + const RepartitionToWide dw; + // Unsigned 16-bit shift so we can pack. + const Vec256 uH = ShiftRight<16>(BitCast(dw, hi)); + const Vec256 uL = ShiftRight<16>(BitCast(dw, lo)); + const __m256i u16 = _mm256_packus_epi32(uL.raw, uH.raw); + return Vec256{_mm256_permute4x64_epi64(u16, _MM_SHUFFLE(3, 1, 2, 0))}; +#endif +} + +template +HWY_API Vec256 ConcatOdd(Full256 d, Vec256 hi, Vec256 lo) { + const RebindToUnsigned du; +#if HWY_TARGET <= HWY_AVX3 + alignas(32) constexpr uint32_t kIdx[8] = {1, 3, 5, 7, 9, 11, 13, 15}; + return BitCast(d, Vec256{_mm256_mask2_permutex2var_epi32( + BitCast(du, lo).raw, Load(du, kIdx).raw, __mmask8{0xFF}, + BitCast(du, hi).raw)}); +#else + const RebindToFloat df; + const Vec256 v3131{_mm256_shuffle_ps( + BitCast(df, lo).raw, BitCast(df, hi).raw, _MM_SHUFFLE(3, 1, 3, 1))}; + return Vec256{_mm256_permute4x64_epi64(BitCast(du, v3131).raw, + _MM_SHUFFLE(3, 1, 2, 0))}; +#endif +} + +HWY_API Vec256 ConcatOdd(Full256 d, Vec256 hi, + Vec256 lo) { + const RebindToUnsigned du; +#if HWY_TARGET <= HWY_AVX3 + alignas(32) constexpr uint32_t kIdx[8] = {1, 3, 5, 7, 9, 11, 13, 15}; + return Vec256{_mm256_mask2_permutex2var_ps(lo.raw, Load(du, kIdx).raw, + __mmask8{0xFF}, hi.raw)}; +#else + const Vec256 v3131{ + _mm256_shuffle_ps(lo.raw, hi.raw, _MM_SHUFFLE(3, 1, 3, 1))}; + return BitCast(d, Vec256{_mm256_permute4x64_epi64( + BitCast(du, v3131).raw, _MM_SHUFFLE(3, 1, 2, 0))}); +#endif +} + +template +HWY_API Vec256 ConcatOdd(Full256 d, Vec256 hi, Vec256 lo) { + const RebindToUnsigned du; +#if HWY_TARGET <= HWY_AVX3 + alignas(64) constexpr uint64_t kIdx[4] = {1, 3, 5, 7}; + return BitCast(d, Vec256{_mm256_mask2_permutex2var_epi64( + BitCast(du, lo).raw, Load(du, kIdx).raw, __mmask8{0xFF}, + BitCast(du, hi).raw)}); +#else + const RebindToFloat df; + const Vec256 v31{ + _mm256_shuffle_pd(BitCast(df, lo).raw, BitCast(df, hi).raw, 15)}; + return Vec256{ + _mm256_permute4x64_epi64(BitCast(du, v31).raw, _MM_SHUFFLE(3, 1, 2, 0))}; +#endif +} + +HWY_API Vec256 ConcatOdd(Full256 d, Vec256 hi, + Vec256 lo) { +#if HWY_TARGET <= HWY_AVX3 + const RebindToUnsigned du; + alignas(64) constexpr uint64_t kIdx[4] = {1, 3, 5, 7}; + return Vec256{_mm256_mask2_permutex2var_pd(lo.raw, Load(du, kIdx).raw, + __mmask8{0xFF}, hi.raw)}; +#else + (void)d; + const Vec256 v31{_mm256_shuffle_pd(lo.raw, hi.raw, 15)}; + return Vec256{ + _mm256_permute4x64_pd(v31.raw, _MM_SHUFFLE(3, 1, 2, 0))}; +#endif +} + +// ------------------------------ ConcatEven + +template +HWY_API Vec256 ConcatEven(Full256 d, Vec256 hi, Vec256 lo) { + const RebindToUnsigned du; +#if HWY_TARGET == HWY_AVX3_DL + alignas(64) constexpr uint8_t kIdx[32] = { + 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, + 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62}; + return BitCast(d, Vec256{_mm256_mask2_permutex2var_epi8( + BitCast(du, lo).raw, Load(du, kIdx).raw, + __mmask32{0xFFFFFFFFu}, 
BitCast(du, hi).raw)}); +#else + const RepartitionToWide dw; + // Isolate lower 8 bits per u16 so we can pack. + const Vec256 mask = Set(dw, 0x00FF); + const Vec256 uH = And(BitCast(dw, hi), mask); + const Vec256 uL = And(BitCast(dw, lo), mask); + const __m256i u8 = _mm256_packus_epi16(uL.raw, uH.raw); + return Vec256{_mm256_permute4x64_epi64(u8, _MM_SHUFFLE(3, 1, 2, 0))}; +#endif +} + +template +HWY_API Vec256 ConcatEven(Full256 d, Vec256 hi, Vec256 lo) { + const RebindToUnsigned du; +#if HWY_TARGET <= HWY_AVX3 + alignas(64) constexpr uint16_t kIdx[16] = {0, 2, 4, 6, 8, 10, 12, 14, + 16, 18, 20, 22, 24, 26, 28, 30}; + return BitCast(d, Vec256{_mm256_mask2_permutex2var_epi16( + BitCast(du, lo).raw, Load(du, kIdx).raw, + __mmask16{0xFFFF}, BitCast(du, hi).raw)}); +#else + const RepartitionToWide dw; + // Isolate lower 16 bits per u32 so we can pack. + const Vec256 mask = Set(dw, 0x0000FFFF); + const Vec256 uH = And(BitCast(dw, hi), mask); + const Vec256 uL = And(BitCast(dw, lo), mask); + const __m256i u16 = _mm256_packus_epi32(uL.raw, uH.raw); + return Vec256{_mm256_permute4x64_epi64(u16, _MM_SHUFFLE(3, 1, 2, 0))}; +#endif +} + +template +HWY_API Vec256 ConcatEven(Full256 d, Vec256 hi, Vec256 lo) { + const RebindToUnsigned du; +#if HWY_TARGET <= HWY_AVX3 + alignas(64) constexpr uint32_t kIdx[8] = {0, 2, 4, 6, 8, 10, 12, 14}; + return BitCast(d, Vec256{_mm256_mask2_permutex2var_epi32( + BitCast(du, lo).raw, Load(du, kIdx).raw, __mmask8{0xFF}, + BitCast(du, hi).raw)}); +#else + const RebindToFloat df; + const Vec256 v2020{_mm256_shuffle_ps( + BitCast(df, lo).raw, BitCast(df, hi).raw, _MM_SHUFFLE(2, 0, 2, 0))}; + return Vec256{_mm256_permute4x64_epi64(BitCast(du, v2020).raw, + _MM_SHUFFLE(3, 1, 2, 0))}; + +#endif +} + +HWY_API Vec256 ConcatEven(Full256 d, Vec256 hi, + Vec256 lo) { + const RebindToUnsigned du; +#if HWY_TARGET <= HWY_AVX3 + alignas(64) constexpr uint32_t kIdx[8] = {0, 2, 4, 6, 8, 10, 12, 14}; + return Vec256{_mm256_mask2_permutex2var_ps(lo.raw, Load(du, kIdx).raw, + __mmask8{0xFF}, hi.raw)}; +#else + const Vec256 v2020{ + _mm256_shuffle_ps(lo.raw, hi.raw, _MM_SHUFFLE(2, 0, 2, 0))}; + return BitCast(d, Vec256{_mm256_permute4x64_epi64( + BitCast(du, v2020).raw, _MM_SHUFFLE(3, 1, 2, 0))}); + +#endif +} + +template +HWY_API Vec256 ConcatEven(Full256 d, Vec256 hi, Vec256 lo) { + const RebindToUnsigned du; +#if HWY_TARGET <= HWY_AVX3 + alignas(64) constexpr uint64_t kIdx[4] = {0, 2, 4, 6}; + return BitCast(d, Vec256{_mm256_mask2_permutex2var_epi64( + BitCast(du, lo).raw, Load(du, kIdx).raw, __mmask8{0xFF}, + BitCast(du, hi).raw)}); +#else + const RebindToFloat df; + const Vec256 v20{ + _mm256_shuffle_pd(BitCast(df, lo).raw, BitCast(df, hi).raw, 0)}; + return Vec256{ + _mm256_permute4x64_epi64(BitCast(du, v20).raw, _MM_SHUFFLE(3, 1, 2, 0))}; + +#endif +} + +HWY_API Vec256 ConcatEven(Full256 d, Vec256 hi, + Vec256 lo) { +#if HWY_TARGET <= HWY_AVX3 + const RebindToUnsigned du; + alignas(64) constexpr uint64_t kIdx[4] = {0, 2, 4, 6}; + return Vec256{_mm256_mask2_permutex2var_pd(lo.raw, Load(du, kIdx).raw, + __mmask8{0xFF}, hi.raw)}; +#else + (void)d; + const Vec256 v20{_mm256_shuffle_pd(lo.raw, hi.raw, 0)}; + return Vec256{ + _mm256_permute4x64_pd(v20.raw, _MM_SHUFFLE(3, 1, 2, 0))}; +#endif +} + +// ------------------------------ DupEven (InterleaveLower) + +template +HWY_API Vec256 DupEven(Vec256 v) { + return Vec256{_mm256_shuffle_epi32(v.raw, _MM_SHUFFLE(2, 2, 0, 0))}; +} +HWY_API Vec256 DupEven(Vec256 v) { + return Vec256{ + _mm256_shuffle_ps(v.raw, v.raw, _MM_SHUFFLE(2, 2, 0, 0))}; 
+} + +template +HWY_API Vec256 DupEven(const Vec256 v) { + return InterleaveLower(Full256(), v, v); +} + +// ------------------------------ DupOdd (InterleaveUpper) + +template +HWY_API Vec256 DupOdd(Vec256 v) { + return Vec256{_mm256_shuffle_epi32(v.raw, _MM_SHUFFLE(3, 3, 1, 1))}; +} +HWY_API Vec256 DupOdd(Vec256 v) { + return Vec256{ + _mm256_shuffle_ps(v.raw, v.raw, _MM_SHUFFLE(3, 3, 1, 1))}; +} + +template +HWY_API Vec256 DupOdd(const Vec256 v) { + return InterleaveUpper(Full256(), v, v); +} + +// ------------------------------ OddEven + +namespace detail { + +template +HWY_INLINE Vec256 OddEven(hwy::SizeTag<1> /* tag */, const Vec256 a, + const Vec256 b) { + const Full256 d; + const Full256 d8; + alignas(32) constexpr uint8_t mask[16] = {0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, + 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0}; + return IfThenElse(MaskFromVec(BitCast(d, LoadDup128(d8, mask))), b, a); +} +template +HWY_INLINE Vec256 OddEven(hwy::SizeTag<2> /* tag */, const Vec256 a, + const Vec256 b) { + return Vec256{_mm256_blend_epi16(a.raw, b.raw, 0x55)}; +} +template +HWY_INLINE Vec256 OddEven(hwy::SizeTag<4> /* tag */, const Vec256 a, + const Vec256 b) { + return Vec256{_mm256_blend_epi32(a.raw, b.raw, 0x55)}; +} +template +HWY_INLINE Vec256 OddEven(hwy::SizeTag<8> /* tag */, const Vec256 a, + const Vec256 b) { + return Vec256{_mm256_blend_epi32(a.raw, b.raw, 0x33)}; +} + +} // namespace detail + +template +HWY_API Vec256 OddEven(const Vec256 a, const Vec256 b) { + return detail::OddEven(hwy::SizeTag(), a, b); +} +HWY_API Vec256 OddEven(const Vec256 a, const Vec256 b) { + return Vec256{_mm256_blend_ps(a.raw, b.raw, 0x55)}; +} + +HWY_API Vec256 OddEven(const Vec256 a, const Vec256 b) { + return Vec256{_mm256_blend_pd(a.raw, b.raw, 5)}; +} + +// ------------------------------ OddEvenBlocks + +template +Vec256 OddEvenBlocks(Vec256 odd, Vec256 even) { + return Vec256{_mm256_blend_epi32(odd.raw, even.raw, 0xFu)}; +} + +HWY_API Vec256 OddEvenBlocks(Vec256 odd, Vec256 even) { + return Vec256{_mm256_blend_ps(odd.raw, even.raw, 0xFu)}; +} + +HWY_API Vec256 OddEvenBlocks(Vec256 odd, Vec256 even) { + return Vec256{_mm256_blend_pd(odd.raw, even.raw, 0x3u)}; +} + +// ------------------------------ ReverseBlocks (ConcatLowerUpper) + +template +HWY_API Vec256 ReverseBlocks(Full256 d, Vec256 v) { + return ConcatLowerUpper(d, v, v); +} + +// ------------------------------ TableLookupBytes (ZeroExtendVector) + +// Both full +template +HWY_API Vec256 TableLookupBytes(const Vec256 bytes, + const Vec256 from) { + return Vec256{_mm256_shuffle_epi8(bytes.raw, from.raw)}; +} + +// Partial index vector +template +HWY_API Vec128 TableLookupBytes(const Vec256 bytes, + const Vec128 from) { + // First expand to full 128, then 256. + const auto from_256 = ZeroExtendVector(Full256(), Vec128{from.raw}); + const auto tbl_full = TableLookupBytes(bytes, from_256); + // Shrink to 128, then partial. + return Vec128{LowerHalf(Full128(), tbl_full).raw}; +} + +// Partial table vector +template +HWY_API Vec256 TableLookupBytes(const Vec128 bytes, + const Vec256 from) { + // First expand to full 128, then 256. + const auto bytes_256 = ZeroExtendVector(Full256(), Vec128{bytes.raw}); + return TableLookupBytes(bytes_256, from); +} + +// Partial both are handled by x86_128. + +// ------------------------------ Shl (Mul, ZipLower) + +namespace detail { + +#if HWY_TARGET > HWY_AVX3 // AVX2 or older + +// Returns 2^v for use as per-lane multipliers to emulate 16-bit shifts. 
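+// Editorial note (not upstream code): the u16 becomes the upper half of a
+// binary32 whose exponent field is v. E.g. v = 3: 0x3F80 + (3 << 7) = 0x4100;
+// as an f32, 0x41000000 is 8.0f, and the truncating convert yields 2^3 = 8.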
+template +HWY_INLINE Vec256> Pow2(const Vec256 v) { + static_assert(sizeof(T) == 2, "Only for 16-bit"); + const Full256 d; + const RepartitionToWide dw; + const Rebind df; + const auto zero = Zero(d); + // Move into exponent (this u16 will become the upper half of an f32) + const auto exp = ShiftLeft<23 - 16>(v); + const auto upper = exp + Set(d, 0x3F80); // upper half of 1.0f + // Insert 0 into lower halves for reinterpreting as binary32. + const auto f0 = ZipLower(dw, zero, upper); + const auto f1 = ZipUpper(dw, zero, upper); + // Do not use ConvertTo because it checks for overflow, which is redundant + // because we only care about v in [0, 16). + const Vec256 bits0{_mm256_cvttps_epi32(BitCast(df, f0).raw)}; + const Vec256 bits1{_mm256_cvttps_epi32(BitCast(df, f1).raw)}; + return Vec256>{_mm256_packus_epi32(bits0.raw, bits1.raw)}; +} + +#endif // HWY_TARGET > HWY_AVX3 + +HWY_INLINE Vec256 Shl(hwy::UnsignedTag /*tag*/, Vec256 v, + Vec256 bits) { +#if HWY_TARGET <= HWY_AVX3 + return Vec256{_mm256_sllv_epi16(v.raw, bits.raw)}; +#else + return v * Pow2(bits); +#endif +} + +HWY_INLINE Vec256 Shl(hwy::UnsignedTag /*tag*/, Vec256 v, + Vec256 bits) { + return Vec256{_mm256_sllv_epi32(v.raw, bits.raw)}; +} + +HWY_INLINE Vec256 Shl(hwy::UnsignedTag /*tag*/, Vec256 v, + Vec256 bits) { + return Vec256{_mm256_sllv_epi64(v.raw, bits.raw)}; +} + +template +HWY_INLINE Vec256 Shl(hwy::SignedTag /*tag*/, Vec256 v, Vec256 bits) { + // Signed left shifts are the same as unsigned. + const Full256 di; + const Full256> du; + return BitCast(di, + Shl(hwy::UnsignedTag(), BitCast(du, v), BitCast(du, bits))); +} + +} // namespace detail + +template +HWY_API Vec256 operator<<(Vec256 v, Vec256 bits) { + return detail::Shl(hwy::TypeTag(), v, bits); +} + +// ------------------------------ Shr (MulHigh, IfThenElse, Not) + +HWY_API Vec256 operator>>(Vec256 v, Vec256 bits) { +#if HWY_TARGET <= HWY_AVX3 + return Vec256{_mm256_srlv_epi16(v.raw, bits.raw)}; +#else + Full256 d; + // For bits=0, we cannot mul by 2^16, so fix the result later. + auto out = MulHigh(v, detail::Pow2(Set(d, 16) - bits)); + // Replace output with input where bits == 0. + return IfThenElse(bits == Zero(d), v, out); +#endif +} + +HWY_API Vec256 operator>>(Vec256 v, Vec256 bits) { + return Vec256{_mm256_srlv_epi32(v.raw, bits.raw)}; +} + +HWY_API Vec256 operator>>(Vec256 v, Vec256 bits) { + return Vec256{_mm256_srlv_epi64(v.raw, bits.raw)}; +} + +HWY_API Vec256 operator>>(Vec256 v, Vec256 bits) { +#if HWY_TARGET <= HWY_AVX3 + return Vec256{_mm256_srav_epi16(v.raw, bits.raw)}; +#else + return detail::SignedShr(Full256(), v, bits); +#endif +} + +HWY_API Vec256 operator>>(Vec256 v, Vec256 bits) { + return Vec256{_mm256_srav_epi32(v.raw, bits.raw)}; +} + +HWY_API Vec256 operator>>(Vec256 v, Vec256 bits) { +#if HWY_TARGET <= HWY_AVX3 + return Vec256{_mm256_srav_epi64(v.raw, bits.raw)}; +#else + return detail::SignedShr(Full256(), v, bits); +#endif +} + +HWY_INLINE Vec256 MulEven(const Vec256 a, + const Vec256 b) { + const DFromV du64; + const RepartitionToNarrow du32; + const auto maskL = Set(du64, 0xFFFFFFFFULL); + const auto a32 = BitCast(du32, a); + const auto b32 = BitCast(du32, b); + // Inputs for MulEven: we only need the lower 32 bits + const auto aH = Shuffle2301(a32); + const auto bH = Shuffle2301(b32); + + // Knuth double-word multiplication. We use 32x32 = 64 MulEven and only need + // the even (lower 64 bits of every 128-bit block) results. 
See
+  // https://github.com/hcs0/Hackers-Delight/blob/master/muldwu.c.txt
+  const auto aLbL = MulEven(a32, b32);
+  const auto w3 = aLbL & maskL;
+
+  const auto t2 = MulEven(aH, b32) + ShiftRight<32>(aLbL);
+  const auto w2 = t2 & maskL;
+  const auto w1 = ShiftRight<32>(t2);
+
+  const auto t = MulEven(a32, bH) + w2;
+  const auto k = ShiftRight<32>(t);
+
+  const auto mulH = MulEven(aH, bH) + w1 + k;
+  const auto mulL = ShiftLeft<32>(t) + w3;
+  return InterleaveLower(mulL, mulH);
+}
+
+HWY_INLINE Vec256 MulOdd(const Vec256 a,
+                         const Vec256 b) {
+  const DFromV du64;
+  const RepartitionToNarrow du32;
+  const auto maskL = Set(du64, 0xFFFFFFFFULL);
+  const auto a32 = BitCast(du32, a);
+  const auto b32 = BitCast(du32, b);
+  // Inputs for MulEven: we only need bits [95:64] (= upper half of input)
+  const auto aH = Shuffle2301(a32);
+  const auto bH = Shuffle2301(b32);
+
+  // Same as above, but we're using the odd results (upper 64 bits per block).
+  const auto aLbL = MulEven(a32, b32);
+  const auto w3 = aLbL & maskL;
+
+  const auto t2 = MulEven(aH, b32) + ShiftRight<32>(aLbL);
+  const auto w2 = t2 & maskL;
+  const auto w1 = ShiftRight<32>(t2);
+
+  const auto t = MulEven(a32, bH) + w2;
+  const auto k = ShiftRight<32>(t);
+
+  const auto mulH = MulEven(aH, bH) + w1 + k;
+  const auto mulL = ShiftLeft<32>(t) + w3;
+  return InterleaveUpper(du64, mulL, mulH);
+}
+
+// ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower)
+
+HWY_API Vec256 ReorderWidenMulAccumulate(Full256 df32,
+                                         Vec256 a,
+                                         Vec256 b,
+                                         const Vec256 sum0,
+                                         Vec256& sum1) {
+  // TODO(janwas): _mm256_dpbf16_ps when available
+  const Repartition du16;
+  const RebindToUnsigned du32;
+  const Vec256 zero = Zero(du16);
+  // Lane order within sum0/1 is undefined, hence we can avoid the
+  // longer-latency lane-crossing PromoteTo.
+  const Vec256 a0 = ZipLower(du32, zero, BitCast(du16, a));
+  const Vec256 a1 = ZipUpper(du32, zero, BitCast(du16, a));
+  const Vec256 b0 = ZipLower(du32, zero, BitCast(du16, b));
+  const Vec256 b1 = ZipUpper(du32, zero, BitCast(du16, b));
+  sum1 = MulAdd(BitCast(df32, a1), BitCast(df32, b1), sum1);
+  return MulAdd(BitCast(df32, a0), BitCast(df32, b0), sum0);
+}
+
+HWY_API Vec256 ReorderWidenMulAccumulate(Full256 /*d32*/,
+                                         Vec256 a,
+                                         Vec256 b,
+                                         const Vec256 sum0,
+                                         Vec256& /*sum1*/) {
+  return sum0 + Vec256{_mm256_madd_epi16(a.raw, b.raw)};
+}
+
+// ================================================== CONVERT
+
+// ------------------------------ Promotions (part w/ narrow lanes -> full)
+
+HWY_API Vec256 PromoteTo(Full256 /* tag */,
+                         const Vec128 v) {
+  return Vec256{_mm256_cvtps_pd(v.raw)};
+}
+
+HWY_API Vec256 PromoteTo(Full256 /* tag */,
+                         const Vec128 v) {
+  return Vec256{_mm256_cvtepi32_pd(v.raw)};
+}
+
+// Unsigned: zero-extend.
+// Note: these have 3 cycle latency; if inputs are already split across the
+// 128 bit blocks (in their upper/lower halves), then Zip* would be faster.
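+// Usage sketch (hypothetical caller; `src` points to >= 16 valid bytes):
+//   const Full256<uint16_t> d16;
+//   const Full128<uint8_t> d8;
+//   const Vec128<uint8_t> bytes = LoadU(d8, src);
+//   const Vec256<uint16_t> words = PromoteTo(d16, bytes);  // zero-extend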
+HWY_API Vec256 PromoteTo(Full256 /* tag */, + Vec128 v) { + return Vec256{_mm256_cvtepu8_epi16(v.raw)}; +} +HWY_API Vec256 PromoteTo(Full256 /* tag */, + Vec128 v) { + return Vec256{_mm256_cvtepu8_epi32(v.raw)}; +} +HWY_API Vec256 PromoteTo(Full256 /* tag */, + Vec128 v) { + return Vec256{_mm256_cvtepu8_epi16(v.raw)}; +} +HWY_API Vec256 PromoteTo(Full256 /* tag */, + Vec128 v) { + return Vec256{_mm256_cvtepu8_epi32(v.raw)}; +} +HWY_API Vec256 PromoteTo(Full256 /* tag */, + Vec128 v) { + return Vec256{_mm256_cvtepu16_epi32(v.raw)}; +} +HWY_API Vec256 PromoteTo(Full256 /* tag */, + Vec128 v) { + return Vec256{_mm256_cvtepu16_epi32(v.raw)}; +} +HWY_API Vec256 PromoteTo(Full256 /* tag */, + Vec128 v) { + return Vec256{_mm256_cvtepu32_epi64(v.raw)}; +} + +// Signed: replicate sign bit. +// Note: these have 3 cycle latency; if inputs are already split across the +// 128 bit blocks (in their upper/lower halves), then ZipUpper/lo followed by +// signed shift would be faster. +HWY_API Vec256 PromoteTo(Full256 /* tag */, + Vec128 v) { + return Vec256{_mm256_cvtepi8_epi16(v.raw)}; +} +HWY_API Vec256 PromoteTo(Full256 /* tag */, + Vec128 v) { + return Vec256{_mm256_cvtepi8_epi32(v.raw)}; +} +HWY_API Vec256 PromoteTo(Full256 /* tag */, + Vec128 v) { + return Vec256{_mm256_cvtepi16_epi32(v.raw)}; +} +HWY_API Vec256 PromoteTo(Full256 /* tag */, + Vec128 v) { + return Vec256{_mm256_cvtepi32_epi64(v.raw)}; +} + +// ------------------------------ Demotions (full -> part w/ narrow lanes) + +HWY_API Vec128 DemoteTo(Full128 /* tag */, + const Vec256 v) { + const __m256i u16 = _mm256_packus_epi32(v.raw, v.raw); + // Concatenating lower halves of both 128-bit blocks afterward is more + // efficient than an extra input with low block = high block of v. + return Vec128{ + _mm256_castsi256_si128(_mm256_permute4x64_epi64(u16, 0x88))}; +} + +HWY_API Vec128 DemoteTo(Full128 /* tag */, + const Vec256 v) { + const __m256i i16 = _mm256_packs_epi32(v.raw, v.raw); + return Vec128{ + _mm256_castsi256_si128(_mm256_permute4x64_epi64(i16, 0x88))}; +} + +HWY_API Vec128 DemoteTo(Full64 /* tag */, + const Vec256 v) { + const __m256i u16_blocks = _mm256_packus_epi32(v.raw, v.raw); + // Concatenate lower 64 bits of each 128-bit block + const __m256i u16_concat = _mm256_permute4x64_epi64(u16_blocks, 0x88); + const __m128i u16 = _mm256_castsi256_si128(u16_concat); + // packus treats the input as signed; we want unsigned. Clear the MSB to get + // unsigned saturation to u8. + const __m128i i16 = _mm_and_si128(u16, _mm_set1_epi16(0x7FFF)); + return Vec128{_mm_packus_epi16(i16, i16)}; +} + +HWY_API Vec128 DemoteTo(Full128 /* tag */, + const Vec256 v) { + const __m256i u8 = _mm256_packus_epi16(v.raw, v.raw); + return Vec128{ + _mm256_castsi256_si128(_mm256_permute4x64_epi64(u8, 0x88))}; +} + +HWY_API Vec128 DemoteTo(Full64 /* tag */, + const Vec256 v) { + const __m256i i16_blocks = _mm256_packs_epi32(v.raw, v.raw); + // Concatenate lower 64 bits of each 128-bit block + const __m256i i16_concat = _mm256_permute4x64_epi64(i16_blocks, 0x88); + const __m128i i16 = _mm256_castsi256_si128(i16_concat); + return Vec128{_mm_packs_epi16(i16, i16)}; +} + +HWY_API Vec128 DemoteTo(Full128 /* tag */, + const Vec256 v) { + const __m256i i8 = _mm256_packs_epi16(v.raw, v.raw); + return Vec128{ + _mm256_castsi256_si128(_mm256_permute4x64_epi64(i8, 0x88))}; +} + + // Avoid "value of intrinsic immediate argument '8' is out of range '0 - 7'". + // 8 is the correct value of _MM_FROUND_NO_EXC, which is allowed here. 
+HWY_DIAGNOSTICS(push) +HWY_DIAGNOSTICS_OFF(disable : 4556, ignored "-Wsign-conversion") + +HWY_API Vec128 DemoteTo(Full128 df16, + const Vec256 v) { +#ifdef HWY_DISABLE_F16C + const RebindToUnsigned du16; + const Rebind du; + const RebindToSigned di; + const auto bits32 = BitCast(du, v); + const auto sign = ShiftRight<31>(bits32); + const auto biased_exp32 = ShiftRight<23>(bits32) & Set(du, 0xFF); + const auto mantissa32 = bits32 & Set(du, 0x7FFFFF); + + const auto k15 = Set(di, 15); + const auto exp = Min(BitCast(di, biased_exp32) - Set(di, 127), k15); + const auto is_tiny = exp < Set(di, -24); + + const auto is_subnormal = exp < Set(di, -14); + const auto biased_exp16 = + BitCast(du, IfThenZeroElse(is_subnormal, exp + k15)); + const auto sub_exp = BitCast(du, Set(di, -14) - exp); // [1, 11) + const auto sub_m = (Set(du, 1) << (Set(du, 10) - sub_exp)) + + (mantissa32 >> (Set(du, 13) + sub_exp)); + const auto mantissa16 = IfThenElse(RebindMask(du, is_subnormal), sub_m, + ShiftRight<13>(mantissa32)); // <1024 + + const auto sign16 = ShiftLeft<15>(sign); + const auto normal16 = sign16 | ShiftLeft<10>(biased_exp16) | mantissa16; + const auto bits16 = IfThenZeroElse(is_tiny, BitCast(di, normal16)); + return BitCast(df16, DemoteTo(du16, bits16)); +#else + (void)df16; + return Vec128{_mm256_cvtps_ph(v.raw, _MM_FROUND_NO_EXC)}; +#endif +} + +HWY_DIAGNOSTICS(pop) + +HWY_API Vec128 DemoteTo(Full128 dbf16, + const Vec256 v) { + // TODO(janwas): _mm256_cvtneps_pbh once we have avx512bf16. + const Rebind di32; + const Rebind du32; // for logical shift right + const Rebind du16; + const auto bits_in_32 = BitCast(di32, ShiftRight<16>(BitCast(du32, v))); + return BitCast(dbf16, DemoteTo(du16, bits_in_32)); +} + +HWY_API Vec256 ReorderDemote2To(Full256 dbf16, + Vec256 a, Vec256 b) { + // TODO(janwas): _mm256_cvtne2ps_pbh once we have avx512bf16. + const RebindToUnsigned du16; + const Repartition du32; + const Vec256 b_in_even = ShiftRight<16>(BitCast(du32, b)); + return BitCast(dbf16, OddEven(BitCast(du16, a), BitCast(du16, b_in_even))); +} + +HWY_API Vec256 ReorderDemote2To(Full256 /*d16*/, + Vec256 a, Vec256 b) { + return Vec256{_mm256_packs_epi32(a.raw, b.raw)}; +} + +HWY_API Vec128 DemoteTo(Full128 /* tag */, + const Vec256 v) { + return Vec128{_mm256_cvtpd_ps(v.raw)}; +} + +HWY_API Vec128 DemoteTo(Full128 /* tag */, + const Vec256 v) { + const auto clamped = detail::ClampF64ToI32Max(Full256(), v); + return Vec128{_mm256_cvttpd_epi32(clamped.raw)}; +} + +// For already range-limited input [0, 255]. +HWY_API Vec128 U8FromU32(const Vec256 v) { + const Full256 d32; + alignas(32) static constexpr uint32_t k8From32[8] = { + 0x0C080400u, ~0u, ~0u, ~0u, ~0u, 0x0C080400u, ~0u, ~0u}; + // Place first four bytes in lo[0], remaining 4 in hi[1]. + const auto quad = TableLookupBytes(v, Load(d32, k8From32)); + // Interleave both quadruplets - OR instead of unpack reduces port5 pressure. + const auto lo = LowerHalf(quad); + const auto hi = UpperHalf(Full128(), quad); + const auto pair = LowerHalf(lo | hi); + return BitCast(Full64(), pair); +} + +// ------------------------------ Truncations + +namespace detail { + +// LO and HI each hold four indices of bytes within a 128-bit block. 
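+// E.g. LO = 0x05040100 encodes byte indices {0, 1, 4, 5} (little-endian), as
+// used by TruncateTo from u32 to u16 below; adding 0x10101010 rebases the
+// same indices into the upper 128-bit block.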
+template
+HWY_INLINE Vec128 LookupAndConcatHalves(Vec256 v) {
+  const Full256 d32;
+
+#if HWY_TARGET <= HWY_AVX3_DL
+  alignas(32) constexpr uint32_t kMap[8] = {
+      LO, HI, 0x10101010 + LO, 0x10101010 + HI, 0, 0, 0, 0};
+  // Note: the index vector is the *first* argument of _mm256_permutexvar_epi8.
+  const auto result = _mm256_permutexvar_epi8(Load(d32, kMap).raw, v.raw);
+#else
+  alignas(32) static constexpr uint32_t kMap[8] = {LO, HI, ~0u, ~0u,
+                                                   ~0u, ~0u, LO, HI};
+  const auto quad = TableLookupBytes(v, Load(d32, kMap));
+  const auto result = _mm256_permute4x64_epi64(quad.raw, 0xCC);
+  // Possible alternative:
+  // const auto lo = LowerHalf(quad);
+  // const auto hi = UpperHalf(Full128(), quad);
+  // const auto result = lo | hi;
+#endif
+
+  return Vec128{_mm256_castsi256_si128(result)};
+}
+
+// LO and HI each hold two indices of bytes within a 128-bit block.
+template
+HWY_INLINE Vec128 LookupAndConcatQuarters(Vec256 v) {
+  const Full256 d16;
+
+#if HWY_TARGET <= HWY_AVX3_DL
+  alignas(32) constexpr uint16_t kMap[16] = {
+      LO, HI, 0x1010 + LO, 0x1010 + HI, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+  const auto result = _mm256_permutexvar_epi8(Load(d16, kMap).raw, v.raw);
+  return LowerHalf(Vec128{_mm256_castsi256_si128(result)});
+#else
+  constexpr uint16_t ff = static_cast(~0u);
+  alignas(32) static constexpr uint16_t kMap[16] = {
+      LO, ff, HI, ff, ff, ff, ff, ff, ff, ff, ff, ff, LO, ff, HI, ff};
+  const auto quad = TableLookupBytes(v, Load(d16, kMap));
+  const auto mixed = _mm256_permute4x64_epi64(quad.raw, 0xCC);
+  const auto half = _mm256_castsi256_si128(mixed);
+  return LowerHalf(Vec128{_mm_packus_epi32(half, half)});
+#endif
+}
+
+}  // namespace detail
+
+HWY_API Vec128 TruncateTo(Simd /* tag */,
+                          const Vec256 v) {
+  const Full256 d32;
+#if HWY_TARGET <= HWY_AVX3_DL
+  alignas(32) constexpr uint32_t kMap[8] = {0x18100800u, 0, 0, 0, 0, 0, 0, 0};
+  const auto result = _mm256_permutexvar_epi8(Load(d32, kMap).raw, v.raw);
+  return LowerHalf(LowerHalf(LowerHalf(Vec256{result})));
+#else
+  alignas(32) static constexpr uint32_t kMap[8] = {0xFFFF0800u, ~0u, ~0u, ~0u,
+                                                   0x0800FFFFu, ~0u, ~0u, ~0u};
+  const auto quad = TableLookupBytes(v, Load(d32, kMap));
+  const auto lo = LowerHalf(quad);
+  const auto hi = UpperHalf(Full128(), quad);
+  const auto result = lo | hi;
+  return LowerHalf(LowerHalf(Vec128{result.raw}));
+#endif
+}
+
+HWY_API Vec128 TruncateTo(Simd /* tag */,
+                          const Vec256 v) {
+  const auto result = detail::LookupAndConcatQuarters<0x100, 0x908>(v);
+  return Vec128{result.raw};
+}
+
+HWY_API Vec128 TruncateTo(Simd /* tag */,
+                          const Vec256 v) {
+  const Full256 d32;
+  alignas(32) constexpr uint32_t kEven[8] = {0, 2, 4, 6, 0, 2, 4, 6};
+  const auto v32 =
+      TableLookupLanes(BitCast(d32, v), SetTableIndices(d32, kEven));
+  return LowerHalf(Vec256{v32.raw});
+}
+
+HWY_API Vec128 TruncateTo(Simd /* tag */,
+                          const Vec256 v) {
+  const auto full = detail::LookupAndConcatQuarters<0x400, 0xC08>(v);
+  return Vec128{full.raw};
+}
+
+HWY_API Vec128 TruncateTo(Simd /* tag */,
+                          const Vec256 v) {
+  const auto full = detail::LookupAndConcatHalves<0x05040100, 0x0D0C0908>(v);
+  return Vec128{full.raw};
+}
+
+HWY_API Vec128 TruncateTo(Simd /* tag */,
+                          const Vec256 v) {
+  const auto full = detail::LookupAndConcatHalves<0x06040200, 0x0E0C0A08>(v);
+  return Vec128{full.raw};
+}
+
+// ------------------------------ Integer <=> fp (ShiftRight, OddEven)
+
+HWY_API Vec256 ConvertTo(Full256 /* tag */,
+                         const Vec256 v) {
+  return Vec256{_mm256_cvtepi32_ps(v.raw)};
+}
+
+HWY_API Vec256 ConvertTo(Full256 dd, const Vec256 v) {
+#if HWY_TARGET <= HWY_AVX3
+  (void)dd;
+  return
Vec256{_mm256_cvtepi64_pd(v.raw)}; +#else + // Based on wim's approach (https://stackoverflow.com/questions/41144668/) + const Repartition d32; + const Repartition d64; + + // Toggle MSB of lower 32-bits and insert exponent for 2^84 + 2^63 + const auto k84_63 = Set(d64, 0x4530000080000000ULL); + const auto v_upper = BitCast(dd, ShiftRight<32>(BitCast(d64, v)) ^ k84_63); + + // Exponent is 2^52, lower 32 bits from v (=> 32-bit OddEven) + const auto k52 = Set(d32, 0x43300000); + const auto v_lower = BitCast(dd, OddEven(k52, BitCast(d32, v))); + + const auto k84_63_52 = BitCast(dd, Set(d64, 0x4530000080100000ULL)); + return (v_upper - k84_63_52) + v_lower; // order matters! +#endif +} + +HWY_API Vec256 ConvertTo(HWY_MAYBE_UNUSED Full256 df, + const Vec256 v) { +#if HWY_TARGET <= HWY_AVX3 + return Vec256{_mm256_cvtepu32_ps(v.raw)}; +#else + // Based on wim's approach (https://stackoverflow.com/questions/34066228/) + const RebindToUnsigned du32; + const RebindToSigned d32; + + const auto msk_lo = Set(du32, 0xFFFF); + const auto cnst2_16_flt = Set(df, 65536.0f); // 2^16 + + // Extract the 16 lowest/highest significant bits of v and cast to signed int + const auto v_lo = BitCast(d32, And(v, msk_lo)); + const auto v_hi = BitCast(d32, ShiftRight<16>(v)); + + return MulAdd(cnst2_16_flt, ConvertTo(df, v_hi), ConvertTo(df, v_lo)); +#endif +} + +HWY_API Vec256 ConvertTo(HWY_MAYBE_UNUSED Full256 dd, + const Vec256 v) { +#if HWY_TARGET <= HWY_AVX3 + return Vec256{_mm256_cvtepu64_pd(v.raw)}; +#else + // Based on wim's approach (https://stackoverflow.com/questions/41144668/) + const RebindToUnsigned d64; + using VU = VFromD; + + const VU msk_lo = Set(d64, 0xFFFFFFFFULL); + const auto cnst2_32_dbl = Set(dd, 4294967296.0); // 2^32 + + // Extract the 32 lowest significant bits of v + const VU v_lo = And(v, msk_lo); + const VU v_hi = ShiftRight<32>(v); + + auto uint64_to_double256_fast = [&dd](Vec256 w) HWY_ATTR { + w = Or(w, Vec256{ + detail::BitCastToInteger(Set(dd, 0x0010000000000000).raw)}); + return BitCast(dd, w) - Set(dd, 0x0010000000000000); + }; + + const auto v_lo_dbl = uint64_to_double256_fast(v_lo); + return MulAdd(cnst2_32_dbl, uint64_to_double256_fast(v_hi), v_lo_dbl); +#endif +} + +// Truncates (rounds toward zero). +HWY_API Vec256 ConvertTo(Full256 d, const Vec256 v) { + return detail::FixConversionOverflow(d, v, _mm256_cvttps_epi32(v.raw)); +} + +HWY_API Vec256 ConvertTo(Full256 di, const Vec256 v) { +#if HWY_TARGET <= HWY_AVX3 + return detail::FixConversionOverflow(di, v, _mm256_cvttpd_epi64(v.raw)); +#else + using VI = decltype(Zero(di)); + const VI k0 = Zero(di); + const VI k1 = Set(di, 1); + const VI k51 = Set(di, 51); + + // Exponent indicates whether the number can be represented as int64_t. + const VI biased_exp = ShiftRight<52>(BitCast(di, v)) & Set(di, 0x7FF); + const VI exp = biased_exp - Set(di, 0x3FF); + const auto in_range = exp < Set(di, 63); + + // If we were to cap the exponent at 51 and add 2^52, the number would be in + // [2^52, 2^53) and mantissa bits could be read out directly. We need to + // round-to-0 (truncate), but changing rounding mode in MXCSR hits a + // compiler reordering bug: https://gcc.godbolt.org/z/4hKj6c6qc . We instead + // manually shift the mantissa into place (we already have many of the + // inputs anyway). + const VI shift_mnt = Max(k51 - exp, k0); + const VI shift_int = Max(exp - k51, k0); + const VI mantissa = BitCast(di, v) & Set(di, (1ULL << 52) - 1); + // Include implicit 1-bit; shift by one more to ensure it's in the mantissa. 
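+  // Worked example (illustrative): for v = 3.7, exp = 1, hence shift_mnt = 50
+  // and shift_int = 0; (mantissa | 2^52) >> (50 + 1) = 3 = trunc(3.7).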
+ const VI int52 = (mantissa | Set(di, 1ULL << 52)) >> (shift_mnt + k1); + // For inputs larger than 2^52, insert zeros at the bottom. + const VI shifted = int52 << shift_int; + // Restore the one bit lost when shifting in the implicit 1-bit. + const VI restored = shifted | ((mantissa & k1) << (shift_int - k1)); + + // Saturate to LimitsMin (unchanged when negating below) or LimitsMax. + const VI sign_mask = BroadcastSignBit(BitCast(di, v)); + const VI limit = Set(di, LimitsMax()) - sign_mask; + const VI magnitude = IfThenElse(in_range, restored, limit); + + // If the input was negative, negate the integer (two's complement). + return (magnitude ^ sign_mask) - sign_mask; +#endif +} + +HWY_API Vec256 NearestInt(const Vec256 v) { + const Full256 di; + return detail::FixConversionOverflow(di, v, _mm256_cvtps_epi32(v.raw)); +} + + +HWY_API Vec256 PromoteTo(Full256 df32, + const Vec128 v) { +#ifdef HWY_DISABLE_F16C + const RebindToSigned di32; + const RebindToUnsigned du32; + // Expand to u32 so we can shift. + const auto bits16 = PromoteTo(du32, Vec128{v.raw}); + const auto sign = ShiftRight<15>(bits16); + const auto biased_exp = ShiftRight<10>(bits16) & Set(du32, 0x1F); + const auto mantissa = bits16 & Set(du32, 0x3FF); + const auto subnormal = + BitCast(du32, ConvertTo(df32, BitCast(di32, mantissa)) * + Set(df32, 1.0f / 16384 / 1024)); + + const auto biased_exp32 = biased_exp + Set(du32, 127 - 15); + const auto mantissa32 = ShiftLeft<23 - 10>(mantissa); + const auto normal = ShiftLeft<23>(biased_exp32) | mantissa32; + const auto bits32 = IfThenElse(biased_exp == Zero(du32), subnormal, normal); + return BitCast(df32, ShiftLeft<31>(sign) | bits32); +#else + (void)df32; + return Vec256{_mm256_cvtph_ps(v.raw)}; +#endif +} + +HWY_API Vec256 PromoteTo(Full256 df32, + const Vec128 v) { + const Rebind du16; + const RebindToSigned di32; + return BitCast(df32, ShiftLeft<16>(PromoteTo(di32, BitCast(du16, v)))); +} + +// ================================================== CRYPTO + +#if !defined(HWY_DISABLE_PCLMUL_AES) + +// Per-target flag to prevent generic_ops-inl.h from defining AESRound. 
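+// The define/undef toggle below marks, for the current target pass, that a
+// native implementation is provided: generic_ops-inl.h uses the same toggle
+// idiom and skips its fallback when HWY_NATIVE_AES is already out of sync
+// with HWY_TARGET_TOGGLE (see generic_ops-inl.h for the exact check).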
+#ifdef HWY_NATIVE_AES +#undef HWY_NATIVE_AES +#else +#define HWY_NATIVE_AES +#endif + +HWY_API Vec256 AESRound(Vec256 state, + Vec256 round_key) { +#if HWY_TARGET == HWY_AVX3_DL + return Vec256{_mm256_aesenc_epi128(state.raw, round_key.raw)}; +#else + const Full256 d; + const Half d2; + return Combine(d, AESRound(UpperHalf(d2, state), UpperHalf(d2, round_key)), + AESRound(LowerHalf(state), LowerHalf(round_key))); +#endif +} + +HWY_API Vec256 AESLastRound(Vec256 state, + Vec256 round_key) { +#if HWY_TARGET == HWY_AVX3_DL + return Vec256{_mm256_aesenclast_epi128(state.raw, round_key.raw)}; +#else + const Full256 d; + const Half d2; + return Combine(d, + AESLastRound(UpperHalf(d2, state), UpperHalf(d2, round_key)), + AESLastRound(LowerHalf(state), LowerHalf(round_key))); +#endif +} + +HWY_API Vec256 CLMulLower(Vec256 a, Vec256 b) { +#if HWY_TARGET == HWY_AVX3_DL + return Vec256{_mm256_clmulepi64_epi128(a.raw, b.raw, 0x00)}; +#else + const Full256 d; + const Half d2; + return Combine(d, CLMulLower(UpperHalf(d2, a), UpperHalf(d2, b)), + CLMulLower(LowerHalf(a), LowerHalf(b))); +#endif +} + +HWY_API Vec256 CLMulUpper(Vec256 a, Vec256 b) { +#if HWY_TARGET == HWY_AVX3_DL + return Vec256{_mm256_clmulepi64_epi128(a.raw, b.raw, 0x11)}; +#else + const Full256 d; + const Half d2; + return Combine(d, CLMulUpper(UpperHalf(d2, a), UpperHalf(d2, b)), + CLMulUpper(LowerHalf(a), LowerHalf(b))); +#endif +} + +#endif // HWY_DISABLE_PCLMUL_AES + +// ================================================== MISC + +// Returns a vector with lane i=[0, N) set to "first" + i. +template +HWY_API Vec256 Iota(const Full256 d, const T2 first) { + HWY_ALIGN T lanes[32 / sizeof(T)]; + for (size_t i = 0; i < 32 / sizeof(T); ++i) { + lanes[i] = static_cast(first + static_cast(i)); + } + return Load(d, lanes); +} + +#if HWY_TARGET <= HWY_AVX3 + +// ------------------------------ LoadMaskBits + +// `p` points to at least 8 readable bytes, not all of which need be valid. +template +HWY_API Mask256 LoadMaskBits(const Full256 /* tag */, + const uint8_t* HWY_RESTRICT bits) { + constexpr size_t N = 32 / sizeof(T); + constexpr size_t kNumBytes = (N + 7) / 8; + + uint64_t mask_bits = 0; + CopyBytes(bits, &mask_bits); + + if (N < 8) { + mask_bits &= (1ull << N) - 1; + } + + return Mask256::FromBits(mask_bits); +} + +// ------------------------------ StoreMaskBits + +// `p` points to at least 8 writable bytes. +template +HWY_API size_t StoreMaskBits(const Full256 /* tag */, const Mask256 mask, + uint8_t* bits) { + constexpr size_t N = 32 / sizeof(T); + constexpr size_t kNumBytes = (N + 7) / 8; + + CopyBytes(&mask.raw, bits); + + // Non-full byte, need to clear the undefined upper bits. + if (N < 8) { + const int mask_bits = static_cast((1ull << N) - 1); + bits[0] = static_cast(bits[0] & mask_bits); + } + return kNumBytes; +} + +// ------------------------------ Mask testing + +template +HWY_API size_t CountTrue(const Full256 /* tag */, const Mask256 mask) { + return PopCount(static_cast(mask.raw)); +} + +template +HWY_API size_t FindKnownFirstTrue(const Full256 /* tag */, + const Mask256 mask) { + return Num0BitsBelowLS1Bit_Nonzero32(mask.raw); +} + +template +HWY_API intptr_t FindFirstTrue(const Full256 d, const Mask256 mask) { + return mask.raw ? static_cast(FindKnownFirstTrue(d, mask)) + : intptr_t{-1}; +} + +// Beware: the suffix indicates the number of mask bits, not lane size! 
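+// E.g. a Mask256<uint8_t> has 32 lanes, so the SizeTag<1> overloads below use
+// _kortestz_mask32_u8 / 32-bit comparisons even though each lane is one byte.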
+ +namespace detail { + +template +HWY_INLINE bool AllFalse(hwy::SizeTag<1> /*tag*/, const Mask256 mask) { +#if HWY_COMPILER_HAS_MASK_INTRINSICS + return _kortestz_mask32_u8(mask.raw, mask.raw); +#else + return mask.raw == 0; +#endif +} +template +HWY_INLINE bool AllFalse(hwy::SizeTag<2> /*tag*/, const Mask256 mask) { +#if HWY_COMPILER_HAS_MASK_INTRINSICS + return _kortestz_mask16_u8(mask.raw, mask.raw); +#else + return mask.raw == 0; +#endif +} +template +HWY_INLINE bool AllFalse(hwy::SizeTag<4> /*tag*/, const Mask256 mask) { +#if HWY_COMPILER_HAS_MASK_INTRINSICS + return _kortestz_mask8_u8(mask.raw, mask.raw); +#else + return mask.raw == 0; +#endif +} +template +HWY_INLINE bool AllFalse(hwy::SizeTag<8> /*tag*/, const Mask256 mask) { + return (uint64_t{mask.raw} & 0xF) == 0; +} + +} // namespace detail + +template +HWY_API bool AllFalse(const Full256 /* tag */, const Mask256 mask) { + return detail::AllFalse(hwy::SizeTag(), mask); +} + +namespace detail { + +template +HWY_INLINE bool AllTrue(hwy::SizeTag<1> /*tag*/, const Mask256 mask) { +#if HWY_COMPILER_HAS_MASK_INTRINSICS + return _kortestc_mask32_u8(mask.raw, mask.raw); +#else + return mask.raw == 0xFFFFFFFFu; +#endif +} +template +HWY_INLINE bool AllTrue(hwy::SizeTag<2> /*tag*/, const Mask256 mask) { +#if HWY_COMPILER_HAS_MASK_INTRINSICS + return _kortestc_mask16_u8(mask.raw, mask.raw); +#else + return mask.raw == 0xFFFFu; +#endif +} +template +HWY_INLINE bool AllTrue(hwy::SizeTag<4> /*tag*/, const Mask256 mask) { +#if HWY_COMPILER_HAS_MASK_INTRINSICS + return _kortestc_mask8_u8(mask.raw, mask.raw); +#else + return mask.raw == 0xFFu; +#endif +} +template +HWY_INLINE bool AllTrue(hwy::SizeTag<8> /*tag*/, const Mask256 mask) { + // Cannot use _kortestc because we have less than 8 mask bits. + return mask.raw == 0xFu; +} + +} // namespace detail + +template +HWY_API bool AllTrue(const Full256 /* tag */, const Mask256 mask) { + return detail::AllTrue(hwy::SizeTag(), mask); +} + +// ------------------------------ Compress + +// 16-bit is defined in x86_512 so we can use 512-bit vectors. + +template +HWY_API Vec256 Compress(Vec256 v, Mask256 mask) { + return Vec256{_mm256_maskz_compress_epi32(mask.raw, v.raw)}; +} + +HWY_API Vec256 Compress(Vec256 v, Mask256 mask) { + return Vec256{_mm256_maskz_compress_ps(mask.raw, v.raw)}; +} + +template +HWY_API Vec256 Compress(Vec256 v, Mask256 mask) { + // See CompressIsPartition. + alignas(16) constexpr uint64_t packed_array[16] = { + // PrintCompress64x4NibbleTables + 0x00003210, 0x00003210, 0x00003201, 0x00003210, 0x00003102, 0x00003120, + 0x00003021, 0x00003210, 0x00002103, 0x00002130, 0x00002031, 0x00002310, + 0x00001032, 0x00001320, 0x00000321, 0x00003210}; + + // For lane i, shift the i-th 4-bit index down to bits [0, 2) - + // _mm256_permutexvar_epi64 will ignore the upper bits. + const Full256 d; + const RebindToUnsigned du64; + const auto packed = Set(du64, packed_array[mask.raw]); + alignas(64) constexpr uint64_t shifts[4] = {0, 4, 8, 12}; + const auto indices = Indices256{(packed >> Load(du64, shifts)).raw}; + return TableLookupLanes(v, indices); +} + +// ------------------------------ CompressNot (Compress) + +template +HWY_API Vec256 CompressNot(Vec256 v, const Mask256 mask) { + return Compress(v, Not(mask)); +} + +template +HWY_API Vec256 CompressNot(Vec256 v, Mask256 mask) { + // See CompressIsPartition. 
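+  // (CompressIsPartition: the output is a permutation of the input in which
+  // the mask=true lanes come first, followed by the remaining lanes.)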
+  alignas(16) constexpr uint64_t packed_array[16] = {
+      // PrintCompressNot64x4NibbleTables
+      0x00003210, 0x00000321, 0x00001320, 0x00001032, 0x00002310, 0x00002031,
+      0x00002130, 0x00002103, 0x00003210, 0x00003021, 0x00003120, 0x00003102,
+      0x00003210, 0x00003201, 0x00003210, 0x00003210};
+
+  // For lane i, shift the i-th 4-bit index down to bits [0, 2) -
+  // _mm256_permutexvar_epi64 will ignore the upper bits.
+  const Full256 d;
+  const RebindToUnsigned du64;
+  const auto packed = Set(du64, packed_array[mask.raw]);
+  alignas(64) constexpr uint64_t shifts[4] = {0, 4, 8, 12};
+  const auto indices = Indices256{(packed >> Load(du64, shifts)).raw};
+  return TableLookupLanes(v, indices);
+}
+
+// ------------------------------ CompressBlocksNot
+HWY_API Vec256 CompressBlocksNot(Vec256 v,
+                                 Mask256 mask) {
+  return CompressNot(v, mask);
+}
+
+// ------------------------------ CompressBits (LoadMaskBits)
+template
+HWY_API Vec256 CompressBits(Vec256 v, const uint8_t* HWY_RESTRICT bits) {
+  return Compress(v, LoadMaskBits(Full256(), bits));
+}
+
+// ------------------------------ CompressStore
+
+template
+HWY_API size_t CompressStore(Vec256 v, Mask256 mask, Full256 d,
+                             T* HWY_RESTRICT unaligned) {
+  const Rebind du;
+  const auto vu = BitCast(du, v);  // (required for float16_t inputs)
+
+  const uint64_t mask_bits{mask.raw};
+
+#if HWY_TARGET == HWY_AVX3_DL  // VBMI2
+  _mm256_mask_compressstoreu_epi16(unaligned, mask.raw, vu.raw);
+#else
+  // Split into halves to keep the table size manageable.
+  const Half duh;
+  const auto vL = LowerHalf(duh, vu);
+  const auto vH = UpperHalf(duh, vu);
+
+  const uint64_t mask_bitsL = mask_bits & 0xFF;
+  const uint64_t mask_bitsH = mask_bits >> 8;
+
+  const auto idxL = detail::IndicesForCompress16(mask_bitsL);
+  const auto idxH = detail::IndicesForCompress16(mask_bitsH);
+
+  // Compress and store the two 128-bit halves.
+ const Vec128 cL{_mm_permutexvar_epi16(idxL.raw, vL.raw)}; + const Vec128 cH{_mm_permutexvar_epi16(idxH.raw, vH.raw)}; + const Half dh; + StoreU(BitCast(dh, cL), dh, unaligned); + StoreU(BitCast(dh, cH), dh, unaligned + PopCount(mask_bitsL)); +#endif // HWY_TARGET == HWY_AVX3_DL + + return PopCount(mask_bits); +} + +template +HWY_API size_t CompressStore(Vec256 v, Mask256 mask, Full256 /* tag */, + T* HWY_RESTRICT unaligned) { + _mm256_mask_compressstoreu_epi32(unaligned, mask.raw, v.raw); + const size_t count = PopCount(uint64_t{mask.raw}); + // Workaround for MSAN not marking output as initialized (b/233326619) +#if HWY_IS_MSAN + __msan_unpoison(unaligned, count * sizeof(T)); +#endif + return count; +} + +template +HWY_API size_t CompressStore(Vec256 v, Mask256 mask, Full256 /* tag */, + T* HWY_RESTRICT unaligned) { + _mm256_mask_compressstoreu_epi64(unaligned, mask.raw, v.raw); + const size_t count = PopCount(uint64_t{mask.raw} & 0xFull); + // Workaround for MSAN not marking output as initialized (b/233326619) +#if HWY_IS_MSAN + __msan_unpoison(unaligned, count * sizeof(T)); +#endif + return count; +} + +HWY_API size_t CompressStore(Vec256 v, Mask256 mask, + Full256 /* tag */, + float* HWY_RESTRICT unaligned) { + _mm256_mask_compressstoreu_ps(unaligned, mask.raw, v.raw); + const size_t count = PopCount(uint64_t{mask.raw}); + // Workaround for MSAN not marking output as initialized (b/233326619) +#if HWY_IS_MSAN + __msan_unpoison(unaligned, count * sizeof(float)); +#endif + return count; +} + +HWY_API size_t CompressStore(Vec256 v, Mask256 mask, + Full256 /* tag */, + double* HWY_RESTRICT unaligned) { + _mm256_mask_compressstoreu_pd(unaligned, mask.raw, v.raw); + const size_t count = PopCount(uint64_t{mask.raw} & 0xFull); + // Workaround for MSAN not marking output as initialized (b/233326619) +#if HWY_IS_MSAN + __msan_unpoison(unaligned, count * sizeof(double)); +#endif + return count; +} + +// ------------------------------ CompressBlendedStore (CompressStore) + +template +HWY_API size_t CompressBlendedStore(Vec256 v, Mask256 m, Full256 d, + T* HWY_RESTRICT unaligned) { + // Native (32 or 64-bit) AVX-512 instruction already does the blending at no + // extra cost (latency 11, rthroughput 2 - same as compress plus store). + return CompressStore(v, m, d, unaligned); +} + +template +HWY_API size_t CompressBlendedStore(Vec256 v, Mask256 m, Full256 d, + T* HWY_RESTRICT unaligned) { +#if HWY_TARGET <= HWY_AVX3_DL + return CompressStore(v, m, d, unaligned); // also native +#else + const size_t count = CountTrue(d, m); + BlendedStore(Compress(v, m), FirstN(d, count), d, unaligned); + // Workaround for MSAN not marking output as initialized (b/233326619) +#if HWY_IS_MSAN + __msan_unpoison(unaligned, count * sizeof(T)); +#endif + return count; +#endif +} + +// ------------------------------ CompressBitsStore (LoadMaskBits) + +template +HWY_API size_t CompressBitsStore(Vec256 v, const uint8_t* HWY_RESTRICT bits, + Full256 d, T* HWY_RESTRICT unaligned) { + return CompressStore(v, LoadMaskBits(d, bits), d, unaligned); +} + +#else // AVX2 + +// ------------------------------ LoadMaskBits (TestBit) + +namespace detail { + +// 256 suffix avoids ambiguity with x86_128 without needing HWY_IF_LE128 there. +template +HWY_INLINE Mask256 LoadMaskBits256(Full256 d, uint64_t mask_bits) { + const RebindToUnsigned du; + const Repartition du32; + const auto vbits = BitCast(du, Set(du32, static_cast(mask_bits))); + + // Replicate bytes 8x such that each byte contains the bit that governs it. 
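+  // E.g. bit 13 of mask_bits governs lane 13: kRep8 below copies byte 1 of
+  // vbits into lanes 8..15, and kBit then tests bit 13 % 8 = 5 within it.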
+ const Repartition du64; + alignas(32) constexpr uint64_t kRep8[4] = { + 0x0000000000000000ull, 0x0101010101010101ull, 0x0202020202020202ull, + 0x0303030303030303ull}; + const auto rep8 = TableLookupBytes(vbits, BitCast(du, Load(du64, kRep8))); + + alignas(32) constexpr uint8_t kBit[16] = {1, 2, 4, 8, 16, 32, 64, 128, + 1, 2, 4, 8, 16, 32, 64, 128}; + return RebindMask(d, TestBit(rep8, LoadDup128(du, kBit))); +} + +template +HWY_INLINE Mask256 LoadMaskBits256(Full256 d, uint64_t mask_bits) { + const RebindToUnsigned du; + alignas(32) constexpr uint16_t kBit[16] = { + 1, 2, 4, 8, 16, 32, 64, 128, + 0x100, 0x200, 0x400, 0x800, 0x1000, 0x2000, 0x4000, 0x8000}; + const auto vmask_bits = Set(du, static_cast(mask_bits)); + return RebindMask(d, TestBit(vmask_bits, Load(du, kBit))); +} + +template +HWY_INLINE Mask256 LoadMaskBits256(Full256 d, uint64_t mask_bits) { + const RebindToUnsigned du; + alignas(32) constexpr uint32_t kBit[8] = {1, 2, 4, 8, 16, 32, 64, 128}; + const auto vmask_bits = Set(du, static_cast(mask_bits)); + return RebindMask(d, TestBit(vmask_bits, Load(du, kBit))); +} + +template +HWY_INLINE Mask256 LoadMaskBits256(Full256 d, uint64_t mask_bits) { + const RebindToUnsigned du; + alignas(32) constexpr uint64_t kBit[8] = {1, 2, 4, 8}; + return RebindMask(d, TestBit(Set(du, mask_bits), Load(du, kBit))); +} + +} // namespace detail + +// `p` points to at least 8 readable bytes, not all of which need be valid. +template +HWY_API Mask256 LoadMaskBits(Full256 d, + const uint8_t* HWY_RESTRICT bits) { + constexpr size_t N = 32 / sizeof(T); + constexpr size_t kNumBytes = (N + 7) / 8; + + uint64_t mask_bits = 0; + CopyBytes(bits, &mask_bits); + + if (N < 8) { + mask_bits &= (1ull << N) - 1; + } + + return detail::LoadMaskBits256(d, mask_bits); +} + +// ------------------------------ StoreMaskBits + +namespace detail { + +template +HWY_INLINE uint64_t BitsFromMask(const Mask256 mask) { + const Full256 d; + const Full256 d8; + const auto sign_bits = BitCast(d8, VecFromMask(d, mask)).raw; + // Prevent sign-extension of 32-bit masks because the intrinsic returns int. + return static_cast(_mm256_movemask_epi8(sign_bits)); +} + +template +HWY_INLINE uint64_t BitsFromMask(const Mask256 mask) { +#if HWY_ARCH_X86_64 + const Full256 d; + const Full256 d8; + const Mask256 mask8 = MaskFromVec(BitCast(d8, VecFromMask(d, mask))); + const uint64_t sign_bits8 = BitsFromMask(mask8); + // Skip the bits from the lower byte of each u16 (better not to use the + // same packs_epi16 as SSE4, because that requires an extra swizzle here). + return _pext_u64(sign_bits8, 0xAAAAAAAAull); +#else + // Slow workaround for 32-bit builds, which lack _pext_u64. + // Remove useless lower half of each u16 while preserving the sign bit. + // Bytes [0, 8) and [16, 24) have the same sign bits as the input lanes. + const auto sign_bits = _mm256_packs_epi16(mask.raw, _mm256_setzero_si256()); + // Move odd qwords (value zero) to top so they don't affect the mask value. 
+ const auto compressed = + _mm256_permute4x64_epi64(sign_bits, _MM_SHUFFLE(3, 1, 2, 0)); + return static_cast(_mm256_movemask_epi8(compressed)); +#endif // HWY_ARCH_X86_64 +} + +template +HWY_INLINE uint64_t BitsFromMask(const Mask256 mask) { + const Full256 d; + const Full256 df; + const auto sign_bits = BitCast(df, VecFromMask(d, mask)).raw; + return static_cast(_mm256_movemask_ps(sign_bits)); +} + +template +HWY_INLINE uint64_t BitsFromMask(const Mask256 mask) { + const Full256 d; + const Full256 df; + const auto sign_bits = BitCast(df, VecFromMask(d, mask)).raw; + return static_cast(_mm256_movemask_pd(sign_bits)); +} + +} // namespace detail + +// `p` points to at least 8 writable bytes. +template +HWY_API size_t StoreMaskBits(const Full256 /* tag */, const Mask256 mask, + uint8_t* bits) { + constexpr size_t N = 32 / sizeof(T); + constexpr size_t kNumBytes = (N + 7) / 8; + + const uint64_t mask_bits = detail::BitsFromMask(mask); + CopyBytes(&mask_bits, bits); + return kNumBytes; +} + +// ------------------------------ Mask testing + +// Specialize for 16-bit lanes to avoid unnecessary pext. This assumes each mask +// lane is 0 or ~0. +template +HWY_API bool AllFalse(const Full256 d, const Mask256 mask) { + const Repartition d8; + const Mask256 mask8 = MaskFromVec(BitCast(d8, VecFromMask(d, mask))); + return detail::BitsFromMask(mask8) == 0; +} + +template +HWY_API bool AllFalse(const Full256 /* tag */, const Mask256 mask) { + // Cheaper than PTEST, which is 2 uop / 3L. + return detail::BitsFromMask(mask) == 0; +} + +template +HWY_API bool AllTrue(const Full256 d, const Mask256 mask) { + const Repartition d8; + const Mask256 mask8 = MaskFromVec(BitCast(d8, VecFromMask(d, mask))); + return detail::BitsFromMask(mask8) == (1ull << 32) - 1; +} +template +HWY_API bool AllTrue(const Full256 /* tag */, const Mask256 mask) { + constexpr uint64_t kAllBits = (1ull << (32 / sizeof(T))) - 1; + return detail::BitsFromMask(mask) == kAllBits; +} + +template +HWY_API size_t CountTrue(const Full256 d, const Mask256 mask) { + const Repartition d8; + const Mask256 mask8 = MaskFromVec(BitCast(d8, VecFromMask(d, mask))); + return PopCount(detail::BitsFromMask(mask8)) >> 1; +} +template +HWY_API size_t CountTrue(const Full256 /* tag */, const Mask256 mask) { + return PopCount(detail::BitsFromMask(mask)); +} + +template +HWY_API size_t FindKnownFirstTrue(const Full256 /* tag */, + const Mask256 mask) { + const uint64_t mask_bits = detail::BitsFromMask(mask); + return Num0BitsBelowLS1Bit_Nonzero64(mask_bits); +} + +template +HWY_API intptr_t FindFirstTrue(const Full256 /* tag */, + const Mask256 mask) { + const uint64_t mask_bits = detail::BitsFromMask(mask); + return mask_bits ? intptr_t(Num0BitsBelowLS1Bit_Nonzero64(mask_bits)) : -1; +} + +// ------------------------------ Compress, CompressBits + +namespace detail { + +template +HWY_INLINE Vec256 IndicesFromBits(Full256 d, uint64_t mask_bits) { + const RebindToUnsigned d32; + // We need a masked Iota(). With 8 lanes, there are 256 combinations and a LUT + // of SetTableIndices would require 8 KiB, a large part of L1D. The other + // alternative is _pext_u64, but this is extremely slow on Zen2 (18 cycles) + // and unavailable in 32-bit builds. We instead compress each index into 4 + // bits, for a total of 1 KiB. 
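+  // Worked example (illustrative): mask_bits = 0b110 (lanes 1 and 2 true)
+  // selects packed_array[6] = 0x765430a9: the low nibbles 9 and a (1 and 2
+  // mod 8) move the true lanes to the front, and their set MSBs double as
+  // the FirstN mask used by CompressBlendedStore.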
+ alignas(16) constexpr uint32_t packed_array[256] = { + // PrintCompress32x8Tables + 0x76543210, 0x76543218, 0x76543209, 0x76543298, 0x7654310a, 0x765431a8, + 0x765430a9, 0x76543a98, 0x7654210b, 0x765421b8, 0x765420b9, 0x76542b98, + 0x765410ba, 0x76541ba8, 0x76540ba9, 0x7654ba98, 0x7653210c, 0x765321c8, + 0x765320c9, 0x76532c98, 0x765310ca, 0x76531ca8, 0x76530ca9, 0x7653ca98, + 0x765210cb, 0x76521cb8, 0x76520cb9, 0x7652cb98, 0x76510cba, 0x7651cba8, + 0x7650cba9, 0x765cba98, 0x7643210d, 0x764321d8, 0x764320d9, 0x76432d98, + 0x764310da, 0x76431da8, 0x76430da9, 0x7643da98, 0x764210db, 0x76421db8, + 0x76420db9, 0x7642db98, 0x76410dba, 0x7641dba8, 0x7640dba9, 0x764dba98, + 0x763210dc, 0x76321dc8, 0x76320dc9, 0x7632dc98, 0x76310dca, 0x7631dca8, + 0x7630dca9, 0x763dca98, 0x76210dcb, 0x7621dcb8, 0x7620dcb9, 0x762dcb98, + 0x7610dcba, 0x761dcba8, 0x760dcba9, 0x76dcba98, 0x7543210e, 0x754321e8, + 0x754320e9, 0x75432e98, 0x754310ea, 0x75431ea8, 0x75430ea9, 0x7543ea98, + 0x754210eb, 0x75421eb8, 0x75420eb9, 0x7542eb98, 0x75410eba, 0x7541eba8, + 0x7540eba9, 0x754eba98, 0x753210ec, 0x75321ec8, 0x75320ec9, 0x7532ec98, + 0x75310eca, 0x7531eca8, 0x7530eca9, 0x753eca98, 0x75210ecb, 0x7521ecb8, + 0x7520ecb9, 0x752ecb98, 0x7510ecba, 0x751ecba8, 0x750ecba9, 0x75ecba98, + 0x743210ed, 0x74321ed8, 0x74320ed9, 0x7432ed98, 0x74310eda, 0x7431eda8, + 0x7430eda9, 0x743eda98, 0x74210edb, 0x7421edb8, 0x7420edb9, 0x742edb98, + 0x7410edba, 0x741edba8, 0x740edba9, 0x74edba98, 0x73210edc, 0x7321edc8, + 0x7320edc9, 0x732edc98, 0x7310edca, 0x731edca8, 0x730edca9, 0x73edca98, + 0x7210edcb, 0x721edcb8, 0x720edcb9, 0x72edcb98, 0x710edcba, 0x71edcba8, + 0x70edcba9, 0x7edcba98, 0x6543210f, 0x654321f8, 0x654320f9, 0x65432f98, + 0x654310fa, 0x65431fa8, 0x65430fa9, 0x6543fa98, 0x654210fb, 0x65421fb8, + 0x65420fb9, 0x6542fb98, 0x65410fba, 0x6541fba8, 0x6540fba9, 0x654fba98, + 0x653210fc, 0x65321fc8, 0x65320fc9, 0x6532fc98, 0x65310fca, 0x6531fca8, + 0x6530fca9, 0x653fca98, 0x65210fcb, 0x6521fcb8, 0x6520fcb9, 0x652fcb98, + 0x6510fcba, 0x651fcba8, 0x650fcba9, 0x65fcba98, 0x643210fd, 0x64321fd8, + 0x64320fd9, 0x6432fd98, 0x64310fda, 0x6431fda8, 0x6430fda9, 0x643fda98, + 0x64210fdb, 0x6421fdb8, 0x6420fdb9, 0x642fdb98, 0x6410fdba, 0x641fdba8, + 0x640fdba9, 0x64fdba98, 0x63210fdc, 0x6321fdc8, 0x6320fdc9, 0x632fdc98, + 0x6310fdca, 0x631fdca8, 0x630fdca9, 0x63fdca98, 0x6210fdcb, 0x621fdcb8, + 0x620fdcb9, 0x62fdcb98, 0x610fdcba, 0x61fdcba8, 0x60fdcba9, 0x6fdcba98, + 0x543210fe, 0x54321fe8, 0x54320fe9, 0x5432fe98, 0x54310fea, 0x5431fea8, + 0x5430fea9, 0x543fea98, 0x54210feb, 0x5421feb8, 0x5420feb9, 0x542feb98, + 0x5410feba, 0x541feba8, 0x540feba9, 0x54feba98, 0x53210fec, 0x5321fec8, + 0x5320fec9, 0x532fec98, 0x5310feca, 0x531feca8, 0x530feca9, 0x53feca98, + 0x5210fecb, 0x521fecb8, 0x520fecb9, 0x52fecb98, 0x510fecba, 0x51fecba8, + 0x50fecba9, 0x5fecba98, 0x43210fed, 0x4321fed8, 0x4320fed9, 0x432fed98, + 0x4310feda, 0x431feda8, 0x430feda9, 0x43feda98, 0x4210fedb, 0x421fedb8, + 0x420fedb9, 0x42fedb98, 0x410fedba, 0x41fedba8, 0x40fedba9, 0x4fedba98, + 0x3210fedc, 0x321fedc8, 0x320fedc9, 0x32fedc98, 0x310fedca, 0x31fedca8, + 0x30fedca9, 0x3fedca98, 0x210fedcb, 0x21fedcb8, 0x20fedcb9, 0x2fedcb98, + 0x10fedcba, 0x1fedcba8, 0x0fedcba9, 0xfedcba98}; + + // No need to mask because _mm256_permutevar8x32_epi32 ignores bits 3..31. + // Just shift each copy of the 32 bit LUT to extract its 4-bit fields. + // If broadcasting 32-bit from memory incurs the 3-cycle block-crossing + // latency, it may be faster to use LoadDup128 and PSHUFB. 
+ const auto packed = Set(d32, packed_array[mask_bits]); + alignas(32) constexpr uint32_t shifts[8] = {0, 4, 8, 12, 16, 20, 24, 28}; + return packed >> Load(d32, shifts); +} + +template +HWY_INLINE Vec256 IndicesFromBits(Full256 d, uint64_t mask_bits) { + const Repartition d32; + + // For 64-bit, we still need 32-bit indices because there is no 64-bit + // permutevar, but there are only 4 lanes, so we can afford to skip the + // unpacking and load the entire index vector directly. + alignas(32) constexpr uint32_t u32_indices[128] = { + // PrintCompress64x4PairTables + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 2, 3, 4, 5, 6, 7, + 10, 11, 0, 1, 4, 5, 6, 7, 8, 9, 10, 11, 4, 5, 6, 7, + 12, 13, 0, 1, 2, 3, 6, 7, 8, 9, 12, 13, 2, 3, 6, 7, + 10, 11, 12, 13, 0, 1, 6, 7, 8, 9, 10, 11, 12, 13, 6, 7, + 14, 15, 0, 1, 2, 3, 4, 5, 8, 9, 14, 15, 2, 3, 4, 5, + 10, 11, 14, 15, 0, 1, 4, 5, 8, 9, 10, 11, 14, 15, 4, 5, + 12, 13, 14, 15, 0, 1, 2, 3, 8, 9, 12, 13, 14, 15, 2, 3, + 10, 11, 12, 13, 14, 15, 0, 1, 8, 9, 10, 11, 12, 13, 14, 15}; + return Load(d32, u32_indices + 8 * mask_bits); +} + +template +HWY_INLINE Vec256 IndicesFromNotBits(Full256 d, + uint64_t mask_bits) { + const RebindToUnsigned d32; + // We need a masked Iota(). With 8 lanes, there are 256 combinations and a LUT + // of SetTableIndices would require 8 KiB, a large part of L1D. The other + // alternative is _pext_u64, but this is extremely slow on Zen2 (18 cycles) + // and unavailable in 32-bit builds. We instead compress each index into 4 + // bits, for a total of 1 KiB. + alignas(16) constexpr uint32_t packed_array[256] = { + // PrintCompressNot32x8Tables + 0xfedcba98, 0x8fedcba9, 0x9fedcba8, 0x98fedcba, 0xafedcb98, 0xa8fedcb9, + 0xa9fedcb8, 0xa98fedcb, 0xbfedca98, 0xb8fedca9, 0xb9fedca8, 0xb98fedca, + 0xbafedc98, 0xba8fedc9, 0xba9fedc8, 0xba98fedc, 0xcfedba98, 0xc8fedba9, + 0xc9fedba8, 0xc98fedba, 0xcafedb98, 0xca8fedb9, 0xca9fedb8, 0xca98fedb, + 0xcbfeda98, 0xcb8feda9, 0xcb9feda8, 0xcb98feda, 0xcbafed98, 0xcba8fed9, + 0xcba9fed8, 0xcba98fed, 0xdfecba98, 0xd8fecba9, 0xd9fecba8, 0xd98fecba, + 0xdafecb98, 0xda8fecb9, 0xda9fecb8, 0xda98fecb, 0xdbfeca98, 0xdb8feca9, + 0xdb9feca8, 0xdb98feca, 0xdbafec98, 0xdba8fec9, 0xdba9fec8, 0xdba98fec, + 0xdcfeba98, 0xdc8feba9, 0xdc9feba8, 0xdc98feba, 0xdcafeb98, 0xdca8feb9, + 0xdca9feb8, 0xdca98feb, 0xdcbfea98, 0xdcb8fea9, 0xdcb9fea8, 0xdcb98fea, + 0xdcbafe98, 0xdcba8fe9, 0xdcba9fe8, 0xdcba98fe, 0xefdcba98, 0xe8fdcba9, + 0xe9fdcba8, 0xe98fdcba, 0xeafdcb98, 0xea8fdcb9, 0xea9fdcb8, 0xea98fdcb, + 0xebfdca98, 0xeb8fdca9, 0xeb9fdca8, 0xeb98fdca, 0xebafdc98, 0xeba8fdc9, + 0xeba9fdc8, 0xeba98fdc, 0xecfdba98, 0xec8fdba9, 0xec9fdba8, 0xec98fdba, + 0xecafdb98, 0xeca8fdb9, 0xeca9fdb8, 0xeca98fdb, 0xecbfda98, 0xecb8fda9, + 0xecb9fda8, 0xecb98fda, 0xecbafd98, 0xecba8fd9, 0xecba9fd8, 0xecba98fd, + 0xedfcba98, 0xed8fcba9, 0xed9fcba8, 0xed98fcba, 0xedafcb98, 0xeda8fcb9, + 0xeda9fcb8, 0xeda98fcb, 0xedbfca98, 0xedb8fca9, 0xedb9fca8, 0xedb98fca, + 0xedbafc98, 0xedba8fc9, 0xedba9fc8, 0xedba98fc, 0xedcfba98, 0xedc8fba9, + 0xedc9fba8, 0xedc98fba, 0xedcafb98, 0xedca8fb9, 0xedca9fb8, 0xedca98fb, + 0xedcbfa98, 0xedcb8fa9, 0xedcb9fa8, 0xedcb98fa, 0xedcbaf98, 0xedcba8f9, + 0xedcba9f8, 0xedcba98f, 0xfedcba98, 0xf8edcba9, 0xf9edcba8, 0xf98edcba, + 0xfaedcb98, 0xfa8edcb9, 0xfa9edcb8, 0xfa98edcb, 0xfbedca98, 0xfb8edca9, + 0xfb9edca8, 0xfb98edca, 0xfbaedc98, 0xfba8edc9, 0xfba9edc8, 0xfba98edc, + 0xfcedba98, 0xfc8edba9, 0xfc9edba8, 0xfc98edba, 0xfcaedb98, 0xfca8edb9, + 0xfca9edb8, 0xfca98edb, 0xfcbeda98, 0xfcb8eda9, 0xfcb9eda8, 0xfcb98eda, + 
0xfcbaed98, 0xfcba8ed9, 0xfcba9ed8, 0xfcba98ed, 0xfdecba98, 0xfd8ecba9, + 0xfd9ecba8, 0xfd98ecba, 0xfdaecb98, 0xfda8ecb9, 0xfda9ecb8, 0xfda98ecb, + 0xfdbeca98, 0xfdb8eca9, 0xfdb9eca8, 0xfdb98eca, 0xfdbaec98, 0xfdba8ec9, + 0xfdba9ec8, 0xfdba98ec, 0xfdceba98, 0xfdc8eba9, 0xfdc9eba8, 0xfdc98eba, + 0xfdcaeb98, 0xfdca8eb9, 0xfdca9eb8, 0xfdca98eb, 0xfdcbea98, 0xfdcb8ea9, + 0xfdcb9ea8, 0xfdcb98ea, 0xfdcbae98, 0xfdcba8e9, 0xfdcba9e8, 0xfdcba98e, + 0xfedcba98, 0xfe8dcba9, 0xfe9dcba8, 0xfe98dcba, 0xfeadcb98, 0xfea8dcb9, + 0xfea9dcb8, 0xfea98dcb, 0xfebdca98, 0xfeb8dca9, 0xfeb9dca8, 0xfeb98dca, + 0xfebadc98, 0xfeba8dc9, 0xfeba9dc8, 0xfeba98dc, 0xfecdba98, 0xfec8dba9, + 0xfec9dba8, 0xfec98dba, 0xfecadb98, 0xfeca8db9, 0xfeca9db8, 0xfeca98db, + 0xfecbda98, 0xfecb8da9, 0xfecb9da8, 0xfecb98da, 0xfecbad98, 0xfecba8d9, + 0xfecba9d8, 0xfecba98d, 0xfedcba98, 0xfed8cba9, 0xfed9cba8, 0xfed98cba, + 0xfedacb98, 0xfeda8cb9, 0xfeda9cb8, 0xfeda98cb, 0xfedbca98, 0xfedb8ca9, + 0xfedb9ca8, 0xfedb98ca, 0xfedbac98, 0xfedba8c9, 0xfedba9c8, 0xfedba98c, + 0xfedcba98, 0xfedc8ba9, 0xfedc9ba8, 0xfedc98ba, 0xfedcab98, 0xfedca8b9, + 0xfedca9b8, 0xfedca98b, 0xfedcba98, 0xfedcb8a9, 0xfedcb9a8, 0xfedcb98a, + 0xfedcba98, 0xfedcba89, 0xfedcba98, 0xfedcba98}; + + // No need to mask because <_mm256_permutevar8x32_epi32> ignores bits 3..31. + // Just shift each copy of the 32 bit LUT to extract its 4-bit fields. + // If broadcasting 32-bit from memory incurs the 3-cycle block-crossing + // latency, it may be faster to use LoadDup128 and PSHUFB. + const auto packed = Set(d32, packed_array[mask_bits]); + alignas(32) constexpr uint32_t shifts[8] = {0, 4, 8, 12, 16, 20, 24, 28}; + return packed >> Load(d32, shifts); +} + +template +HWY_INLINE Vec256 IndicesFromNotBits(Full256 d, + uint64_t mask_bits) { + const Repartition d32; + + // For 64-bit, we still need 32-bit indices because there is no 64-bit + // permutevar, but there are only 4 lanes, so we can afford to skip the + // unpacking and load the entire index vector directly. + alignas(32) constexpr uint32_t u32_indices[128] = { + // PrintCompressNot64x4PairTables + 8, 9, 10, 11, 12, 13, 14, 15, 10, 11, 12, 13, 14, 15, 8, 9, + 8, 9, 12, 13, 14, 15, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11, + 8, 9, 10, 11, 14, 15, 12, 13, 10, 11, 14, 15, 8, 9, 12, 13, + 8, 9, 14, 15, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11, 12, 13, + 8, 9, 10, 11, 12, 13, 14, 15, 10, 11, 12, 13, 8, 9, 14, 15, + 8, 9, 12, 13, 10, 11, 14, 15, 12, 13, 8, 9, 10, 11, 14, 15, + 8, 9, 10, 11, 12, 13, 14, 15, 10, 11, 8, 9, 12, 13, 14, 15, + 8, 9, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15}; + return Load(d32, u32_indices + 8 * mask_bits); +} +template +HWY_INLINE Vec256 Compress(Vec256 v, const uint64_t mask_bits) { + const Full256 d; + const Repartition du32; + + HWY_DASSERT(mask_bits < (1ull << (32 / sizeof(T)))); + // 32-bit indices because we only have _mm256_permutevar8x32_epi32 (there is + // no instruction for 4x64). + const Indices256 indices{IndicesFromBits(d, mask_bits).raw}; + return BitCast(d, TableLookupLanes(BitCast(du32, v), indices)); +} + +// LUTs are infeasible for 2^16 possible masks, so splice together two +// half-vector Compress. 
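+// Sketch of the splice for a 16-bit mask 0xHHLL: each 128-bit half is
+// compressed via its own 8-bit LUT, then the second result is written
+// starting at offset PopCount(LL) so that the true lanes end up contiguous.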
+template +HWY_INLINE Vec256 Compress(Vec256 v, const uint64_t mask_bits) { + const Full256 d; + const RebindToUnsigned du; + const auto vu16 = BitCast(du, v); // (required for float16_t inputs) + const Half duh; + const auto half0 = LowerHalf(duh, vu16); + const auto half1 = UpperHalf(duh, vu16); + + const uint64_t mask_bits0 = mask_bits & 0xFF; + const uint64_t mask_bits1 = mask_bits >> 8; + const auto compressed0 = detail::CompressBits(half0, mask_bits0); + const auto compressed1 = detail::CompressBits(half1, mask_bits1); + + alignas(32) uint16_t all_true[16] = {}; + // Store mask=true lanes, left to right. + const size_t num_true0 = PopCount(mask_bits0); + Store(compressed0, duh, all_true); + StoreU(compressed1, duh, all_true + num_true0); + + if (hwy::HWY_NAMESPACE::CompressIsPartition::value) { + // Store mask=false lanes, right to left. The second vector fills the upper + // half with right-aligned false lanes. The first vector is shifted + // rightwards to overwrite the true lanes of the second. + alignas(32) uint16_t all_false[16] = {}; + const size_t num_true1 = PopCount(mask_bits1); + Store(compressed1, duh, all_false + 8); + StoreU(compressed0, duh, all_false + num_true1); + + const auto mask = FirstN(du, num_true0 + num_true1); + return BitCast(d, + IfThenElse(mask, Load(du, all_true), Load(du, all_false))); + } else { + // Only care about the mask=true lanes. + return BitCast(d, Load(du, all_true)); + } +} + +template +HWY_INLINE Vec256 CompressNot(Vec256 v, const uint64_t mask_bits) { + const Full256 d; + const Repartition du32; + + HWY_DASSERT(mask_bits < (1ull << (32 / sizeof(T)))); + // 32-bit indices because we only have _mm256_permutevar8x32_epi32 (there is + // no instruction for 4x64). + const Indices256 indices{IndicesFromNotBits(d, mask_bits).raw}; + return BitCast(d, TableLookupLanes(BitCast(du32, v), indices)); +} + +// LUTs are infeasible for 2^16 possible masks, so splice together two +// half-vector Compress. +template +HWY_INLINE Vec256 CompressNot(Vec256 v, const uint64_t mask_bits) { + // Compress ensures only the lower 16 bits are set, so flip those. 
+ return Compress(v, mask_bits ^ 0xFFFF); +} + +} // namespace detail + +template +HWY_API Vec256 Compress(Vec256 v, Mask256 m) { + return detail::Compress(v, detail::BitsFromMask(m)); +} + +template +HWY_API Vec256 CompressNot(Vec256 v, Mask256 m) { + return detail::CompressNot(v, detail::BitsFromMask(m)); +} + +HWY_API Vec256 CompressBlocksNot(Vec256 v, + Mask256 mask) { + return CompressNot(v, mask); +} + +template +HWY_API Vec256 CompressBits(Vec256 v, const uint8_t* HWY_RESTRICT bits) { + constexpr size_t N = 32 / sizeof(T); + constexpr size_t kNumBytes = (N + 7) / 8; + + uint64_t mask_bits = 0; + CopyBytes(bits, &mask_bits); + + if (N < 8) { + mask_bits &= (1ull << N) - 1; + } + + return detail::Compress(v, mask_bits); +} + +// ------------------------------ CompressStore, CompressBitsStore + +template +HWY_API size_t CompressStore(Vec256 v, Mask256 m, Full256 d, + T* HWY_RESTRICT unaligned) { + const uint64_t mask_bits = detail::BitsFromMask(m); + const size_t count = PopCount(mask_bits); + StoreU(detail::Compress(v, mask_bits), d, unaligned); + // Workaround for MSAN not marking output as initialized (b/233326619) +#if HWY_IS_MSAN + __msan_unpoison(unaligned, count * sizeof(T)); +#endif + return count; +} + +template +HWY_API size_t CompressBlendedStore(Vec256 v, Mask256 m, Full256 d, + T* HWY_RESTRICT unaligned) { + const uint64_t mask_bits = detail::BitsFromMask(m); + const size_t count = PopCount(mask_bits); + + const Repartition du32; + HWY_DASSERT(mask_bits < (1ull << (32 / sizeof(T)))); + // 32-bit indices because we only have _mm256_permutevar8x32_epi32 (there is + // no instruction for 4x64). Nibble MSB encodes FirstN. + const Vec256 idx_and_mask = detail::IndicesFromBits(d, mask_bits); + // Shift nibble MSB into MSB + const Mask256 mask32 = MaskFromVec(ShiftLeft<28>(idx_and_mask)); + // First cast to unsigned (RebindMask cannot change lane size) + const Mask256> mask_u{mask32.raw}; + const Mask256 mask = RebindMask(d, mask_u); + const Vec256 compressed = + BitCast(d, TableLookupLanes(BitCast(du32, v), + Indices256{idx_and_mask.raw})); + + BlendedStore(compressed, mask, d, unaligned); + // Workaround for MSAN not marking output as initialized (b/233326619) +#if HWY_IS_MSAN + __msan_unpoison(unaligned, count * sizeof(T)); +#endif + return count; +} + +template +HWY_API size_t CompressBlendedStore(Vec256 v, Mask256 m, Full256 d, + T* HWY_RESTRICT unaligned) { + const uint64_t mask_bits = detail::BitsFromMask(m); + const size_t count = PopCount(mask_bits); + const Vec256 compressed = detail::Compress(v, mask_bits); + +#if HWY_MEM_OPS_MIGHT_FAULT // true if HWY_IS_MSAN + // BlendedStore tests mask for each lane, but we know that the mask is + // FirstN, so we can just copy. 
+ alignas(32) T buf[16]; + Store(compressed, d, buf); + memcpy(unaligned, buf, count * sizeof(T)); +#else + BlendedStore(compressed, FirstN(d, count), d, unaligned); +#endif + return count; +} + +template +HWY_API size_t CompressBitsStore(Vec256 v, const uint8_t* HWY_RESTRICT bits, + Full256 d, T* HWY_RESTRICT unaligned) { + constexpr size_t N = 32 / sizeof(T); + constexpr size_t kNumBytes = (N + 7) / 8; + + uint64_t mask_bits = 0; + CopyBytes(bits, &mask_bits); + + if (N < 8) { + mask_bits &= (1ull << N) - 1; + } + const size_t count = PopCount(mask_bits); + + StoreU(detail::Compress(v, mask_bits), d, unaligned); + // Workaround for MSAN not marking output as initialized (b/233326619) +#if HWY_IS_MSAN + __msan_unpoison(unaligned, count * sizeof(T)); +#endif + return count; +} + +#endif // HWY_TARGET <= HWY_AVX3 + +// ------------------------------ LoadInterleaved3/4 + +// Implemented in generic_ops, we just overload LoadTransposedBlocks3/4. + +namespace detail { + +// Input: +// 1 0 (<- first block of unaligned) +// 3 2 +// 5 4 +// Output: +// 3 0 +// 4 1 +// 5 2 +template +HWY_API void LoadTransposedBlocks3(Full256 d, + const T* HWY_RESTRICT unaligned, + Vec256& A, Vec256& B, Vec256& C) { + constexpr size_t N = 32 / sizeof(T); + const Vec256 v10 = LoadU(d, unaligned + 0 * N); // 1 0 + const Vec256 v32 = LoadU(d, unaligned + 1 * N); + const Vec256 v54 = LoadU(d, unaligned + 2 * N); + + A = ConcatUpperLower(d, v32, v10); + B = ConcatLowerUpper(d, v54, v10); + C = ConcatUpperLower(d, v54, v32); +} + +// Input (128-bit blocks): +// 1 0 (first block of unaligned) +// 3 2 +// 5 4 +// 7 6 +// Output: +// 4 0 (LSB of A) +// 5 1 +// 6 2 +// 7 3 +template +HWY_API void LoadTransposedBlocks4(Full256 d, + const T* HWY_RESTRICT unaligned, + Vec256& A, Vec256& B, Vec256& C, + Vec256& D) { + constexpr size_t N = 32 / sizeof(T); + const Vec256 v10 = LoadU(d, unaligned + 0 * N); + const Vec256 v32 = LoadU(d, unaligned + 1 * N); + const Vec256 v54 = LoadU(d, unaligned + 2 * N); + const Vec256 v76 = LoadU(d, unaligned + 3 * N); + + A = ConcatLowerLower(d, v54, v10); + B = ConcatUpperUpper(d, v54, v10); + C = ConcatLowerLower(d, v76, v32); + D = ConcatUpperUpper(d, v76, v32); +} + +} // namespace detail + +// ------------------------------ StoreInterleaved2/3/4 (ConcatUpperLower) + +// Implemented in generic_ops, we just overload StoreTransposedBlocks2/3/4. 
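+// Usage sketch (hypothetical caller; the public StoreInterleaved3 is defined
+// in generic_ops-inl.h and dispatches to the overloads below):
+//   const Full256<uint8_t> d;
+//   StoreInterleaved3(r, g, b, d, out);  // writes r0 g0 b0 r1 g1 b1 ...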
+ +namespace detail { + +// Input (128-bit blocks): +// 2 0 (LSB of i) +// 3 1 +// Output: +// 1 0 +// 3 2 +template +HWY_API void StoreTransposedBlocks2(const Vec256 i, const Vec256 j, + const Full256 d, + T* HWY_RESTRICT unaligned) { + constexpr size_t N = 32 / sizeof(T); + const auto out0 = ConcatLowerLower(d, j, i); + const auto out1 = ConcatUpperUpper(d, j, i); + StoreU(out0, d, unaligned + 0 * N); + StoreU(out1, d, unaligned + 1 * N); +} + +// Input (128-bit blocks): +// 3 0 (LSB of i) +// 4 1 +// 5 2 +// Output: +// 1 0 +// 3 2 +// 5 4 +template +HWY_API void StoreTransposedBlocks3(const Vec256 i, const Vec256 j, + const Vec256 k, Full256 d, + T* HWY_RESTRICT unaligned) { + constexpr size_t N = 32 / sizeof(T); + const auto out0 = ConcatLowerLower(d, j, i); + const auto out1 = ConcatUpperLower(d, i, k); + const auto out2 = ConcatUpperUpper(d, k, j); + StoreU(out0, d, unaligned + 0 * N); + StoreU(out1, d, unaligned + 1 * N); + StoreU(out2, d, unaligned + 2 * N); +} + +// Input (128-bit blocks): +// 4 0 (LSB of i) +// 5 1 +// 6 2 +// 7 3 +// Output: +// 1 0 +// 3 2 +// 5 4 +// 7 6 +template +HWY_API void StoreTransposedBlocks4(const Vec256 i, const Vec256 j, + const Vec256 k, const Vec256 l, + Full256 d, T* HWY_RESTRICT unaligned) { + constexpr size_t N = 32 / sizeof(T); + // Write lower halves, then upper. + const auto out0 = ConcatLowerLower(d, j, i); + const auto out1 = ConcatLowerLower(d, l, k); + StoreU(out0, d, unaligned + 0 * N); + StoreU(out1, d, unaligned + 1 * N); + const auto out2 = ConcatUpperUpper(d, j, i); + const auto out3 = ConcatUpperUpper(d, l, k); + StoreU(out2, d, unaligned + 2 * N); + StoreU(out3, d, unaligned + 3 * N); +} + +} // namespace detail + +// ------------------------------ Reductions + +namespace detail { + +// Returns sum{lane[i]} in each lane. "v3210" is a replicated 128-bit block. +// Same logic as x86/128.h, but with Vec256 arguments. +template +HWY_INLINE Vec256 SumOfLanes(hwy::SizeTag<4> /* tag */, + const Vec256 v3210) { + const auto v1032 = Shuffle1032(v3210); + const auto v31_20_31_20 = v3210 + v1032; + const auto v20_31_20_31 = Shuffle0321(v31_20_31_20); + return v20_31_20_31 + v31_20_31_20; +} +template +HWY_INLINE Vec256 MinOfLanes(hwy::SizeTag<4> /* tag */, + const Vec256 v3210) { + const auto v1032 = Shuffle1032(v3210); + const auto v31_20_31_20 = Min(v3210, v1032); + const auto v20_31_20_31 = Shuffle0321(v31_20_31_20); + return Min(v20_31_20_31, v31_20_31_20); +} +template +HWY_INLINE Vec256 MaxOfLanes(hwy::SizeTag<4> /* tag */, + const Vec256 v3210) { + const auto v1032 = Shuffle1032(v3210); + const auto v31_20_31_20 = Max(v3210, v1032); + const auto v20_31_20_31 = Shuffle0321(v31_20_31_20); + return Max(v20_31_20_31, v31_20_31_20); +} + +template +HWY_INLINE Vec256 SumOfLanes(hwy::SizeTag<8> /* tag */, + const Vec256 v10) { + const auto v01 = Shuffle01(v10); + return v10 + v01; +} +template +HWY_INLINE Vec256 MinOfLanes(hwy::SizeTag<8> /* tag */, + const Vec256 v10) { + const auto v01 = Shuffle01(v10); + return Min(v10, v01); +} +template +HWY_INLINE Vec256 MaxOfLanes(hwy::SizeTag<8> /* tag */, + const Vec256 v10) { + const auto v01 = Shuffle01(v10); + return Max(v10, v01); +} + +HWY_API Vec256 SumOfLanes(hwy::SizeTag<2> /* tag */, + Vec256 v) { + const Full256 d; + const RepartitionToWide d32; + const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF)); + const auto odd = ShiftRight<16>(BitCast(d32, v)); + const auto sum = SumOfLanes(hwy::SizeTag<4>(), even + odd); + // Also broadcast into odd lanes. 
+  return OddEven(BitCast(d, ShiftLeft<16>(sum)), BitCast(d, sum));
+}
+HWY_API Vec256<int16_t> SumOfLanes(hwy::SizeTag<2> /* tag */,
+                                   Vec256<int16_t> v) {
+  const Full256<int16_t> d;
+  const RepartitionToWide<decltype(d)> d32;
+  // Sign-extend
+  const auto even = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, v)));
+  const auto odd = ShiftRight<16>(BitCast(d32, v));
+  const auto sum = SumOfLanes(hwy::SizeTag<4>(), even + odd);
+  // Also broadcast into odd lanes.
+  return OddEven(BitCast(d, ShiftLeft<16>(sum)), BitCast(d, sum));
+}
+
+HWY_API Vec256<uint16_t> MinOfLanes(hwy::SizeTag<2> /* tag */,
+                                    Vec256<uint16_t> v) {
+  const Full256<uint16_t> d;
+  const RepartitionToWide<decltype(d)> d32;
+  const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
+  const auto odd = ShiftRight<16>(BitCast(d32, v));
+  const auto min = MinOfLanes(hwy::SizeTag<4>(), Min(even, odd));
+  // Also broadcast into odd lanes.
+  return OddEven(BitCast(d, ShiftLeft<16>(min)), BitCast(d, min));
+}
+HWY_API Vec256<int16_t> MinOfLanes(hwy::SizeTag<2> /* tag */,
+                                   Vec256<int16_t> v) {
+  const Full256<int16_t> d;
+  const RepartitionToWide<decltype(d)> d32;
+  // Sign-extend
+  const auto even = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, v)));
+  const auto odd = ShiftRight<16>(BitCast(d32, v));
+  const auto min = MinOfLanes(hwy::SizeTag<4>(), Min(even, odd));
+  // Also broadcast into odd lanes.
+  return OddEven(BitCast(d, ShiftLeft<16>(min)), BitCast(d, min));
+}
+
+HWY_API Vec256<uint16_t> MaxOfLanes(hwy::SizeTag<2> /* tag */,
+                                    Vec256<uint16_t> v) {
+  const Full256<uint16_t> d;
+  const RepartitionToWide<decltype(d)> d32;
+  const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
+  const auto odd = ShiftRight<16>(BitCast(d32, v));
+  const auto max = MaxOfLanes(hwy::SizeTag<4>(), Max(even, odd));
+  // Also broadcast into odd lanes.
+  return OddEven(BitCast(d, ShiftLeft<16>(max)), BitCast(d, max));
+}
+HWY_API Vec256<int16_t> MaxOfLanes(hwy::SizeTag<2> /* tag */,
+                                   Vec256<int16_t> v) {
+  const Full256<int16_t> d;
+  const RepartitionToWide<decltype(d)> d32;
+  // Sign-extend
+  const auto even = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, v)));
+  const auto odd = ShiftRight<16>(BitCast(d32, v));
+  const auto max = MaxOfLanes(hwy::SizeTag<4>(), Max(even, odd));
+  // Also broadcast into odd lanes.
+  return OddEven(BitCast(d, ShiftLeft<16>(max)), BitCast(d, max));
+}
+
+}  // namespace detail
+
+// Supported for {uif}{32,64},{ui}16. Returns the broadcasted result.
+template <typename T>
+HWY_API Vec256<T> SumOfLanes(Full256<T> d, const Vec256<T> vHL) {
+  const Vec256<T> vLH = ConcatLowerUpper(d, vHL, vHL);
+  return detail::SumOfLanes(hwy::SizeTag<sizeof(T)>(), vLH + vHL);
+}
+template <typename T>
+HWY_API Vec256<T> MinOfLanes(Full256<T> d, const Vec256<T> vHL) {
+  const Vec256<T> vLH = ConcatLowerUpper(d, vHL, vHL);
+  return detail::MinOfLanes(hwy::SizeTag<sizeof(T)>(), Min(vLH, vHL));
+}
+template <typename T>
+HWY_API Vec256<T> MaxOfLanes(Full256<T> d, const Vec256<T> vHL) {
+  const Vec256<T> vLH = ConcatLowerUpper(d, vHL, vHL);
+  return detail::MaxOfLanes(hwy::SizeTag<sizeof(T)>(), Max(vLH, vHL));
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+// Note that the GCC warnings are not suppressed if we only wrap the *intrin.h -
+// the warning seems to be issued at the call site of intrinsics, i.e. our code.
+HWY_DIAGNOSTICS(pop)
diff --git a/hwy/ops/x86_512-inl.h b/hwy/ops/x86_512-inl.h
new file mode 100644
index 0000000..09b14a9
--- /dev/null
+++ b/hwy/ops/x86_512-inl.h
@@ -0,0 +1,4412 @@
+// Copyright 2019 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// 512-bit AVX512 vectors and operations.
+// External include guard in highway.h - see comment there.
+
+// WARNING: most operations do not cross 128-bit block boundaries. In
+// particular, "Broadcast", pack and zip behavior may be surprising.
+
+// Must come before HWY_DIAGNOSTICS and HWY_COMPILER_CLANGCL
+#include "hwy/base.h"
+
+// Avoid uninitialized warnings in GCC's avx512fintrin.h - see
+// https://github.com/google/highway/issues/710
+HWY_DIAGNOSTICS(push)
+#if HWY_COMPILER_GCC_ACTUAL
+HWY_DIAGNOSTICS_OFF(disable : 4701, ignored "-Wuninitialized")
+HWY_DIAGNOSTICS_OFF(disable : 4703 6001 26494, ignored "-Wmaybe-uninitialized")
+#endif
+
+#include <immintrin.h>  // AVX2+
+
+#if HWY_COMPILER_CLANGCL
+// Including <immintrin.h> should be enough, but Clang's headers helpfully skip
+// including these headers when _MSC_VER is defined, like when using clang-cl.
+// Include these directly here.
+// clang-format off
+#include <smmintrin.h>
+
+#include <avxintrin.h>
+#include <avx2intrin.h>
+#include <f16cintrin.h>
+#include <fmaintrin.h>
+
+#include <avx512fintrin.h>
+#include <avx512vlintrin.h>
+#include <avx512bwintrin.h>
+#include <avx512dqintrin.h>
+#include <avx512vlbwintrin.h>
+#include <avx512vldqintrin.h>
+#include <avx512bitalgintrin.h>
+#include <avx512vlbitalgintrin.h>
+#include <avx512vpopcntdqintrin.h>
+#include <avx512vpopcntdqvlintrin.h>
+// clang-format on
+#endif  // HWY_COMPILER_CLANGCL
+
+#include <stddef.h>
+#include <stdint.h>
+
+#if HWY_IS_MSAN
+#include <sanitizer/msan_interface.h>
+#endif
+
+// For half-width vectors. Already includes base.h and shared-inl.h.
+#include "hwy/ops/x86_256-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+namespace detail {
+
+template <typename T>
+struct Raw512 {
+  using type = __m512i;
+};
+template <>
+struct Raw512<float> {
+  using type = __m512;
+};
+template <>
+struct Raw512<double> {
+  using type = __m512d;
+};
+
+// Template arg: sizeof(lane type)
+template <size_t size>
+struct RawMask512 {};
+template <>
+struct RawMask512<1> {
+  using type = __mmask64;
+};
+template <>
+struct RawMask512<2> {
+  using type = __mmask32;
+};
+template <>
+struct RawMask512<4> {
+  using type = __mmask16;
+};
+template <>
+struct RawMask512<8> {
+  using type = __mmask8;
+};
+
+}  // namespace detail
+
+template <typename T>
+class Vec512 {
+  using Raw = typename detail::Raw512<T>::type;
+
+ public:
+  // Compound assignment. Only usable if there is a corresponding non-member
+  // binary operator overload. For example, only f32 and f64 support division.
+  HWY_INLINE Vec512& operator*=(const Vec512 other) {
+    return *this = (*this * other);
+  }
+  HWY_INLINE Vec512& operator/=(const Vec512 other) {
+    return *this = (*this / other);
+  }
+  HWY_INLINE Vec512& operator+=(const Vec512 other) {
+    return *this = (*this + other);
+  }
+  HWY_INLINE Vec512& operator-=(const Vec512 other) {
+    return *this = (*this - other);
+  }
+  HWY_INLINE Vec512& operator&=(const Vec512 other) {
+    return *this = (*this & other);
+  }
+  HWY_INLINE Vec512& operator|=(const Vec512 other) {
+    return *this = (*this | other);
+  }
+  HWY_INLINE Vec512& operator^=(const Vec512 other) {
+    return *this = (*this ^ other);
+  }
+
+  Raw raw;
+};
+
+// Mask register: one bit per lane.
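+// For example, with uint8_t lanes there are 64 lanes per 512-bit vector, so
+// the mask is a __mmask64; bit i is set iff lane i satisfies the condition.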
+template +struct Mask512 { + typename detail::RawMask512::type raw; +}; + +// ------------------------------ BitCast + +namespace detail { + +HWY_INLINE __m512i BitCastToInteger(__m512i v) { return v; } +HWY_INLINE __m512i BitCastToInteger(__m512 v) { return _mm512_castps_si512(v); } +HWY_INLINE __m512i BitCastToInteger(__m512d v) { + return _mm512_castpd_si512(v); +} + +template +HWY_INLINE Vec512 BitCastToByte(Vec512 v) { + return Vec512{BitCastToInteger(v.raw)}; +} + +// Cannot rely on function overloading because return types differ. +template +struct BitCastFromInteger512 { + HWY_INLINE __m512i operator()(__m512i v) { return v; } +}; +template <> +struct BitCastFromInteger512 { + HWY_INLINE __m512 operator()(__m512i v) { return _mm512_castsi512_ps(v); } +}; +template <> +struct BitCastFromInteger512 { + HWY_INLINE __m512d operator()(__m512i v) { return _mm512_castsi512_pd(v); } +}; + +template +HWY_INLINE Vec512 BitCastFromByte(Full512 /* tag */, Vec512 v) { + return Vec512{BitCastFromInteger512()(v.raw)}; +} + +} // namespace detail + +template +HWY_API Vec512 BitCast(Full512 d, Vec512 v) { + return detail::BitCastFromByte(d, detail::BitCastToByte(v)); +} + +// ------------------------------ Set + +// Returns an all-zero vector. +template +HWY_API Vec512 Zero(Full512 /* tag */) { + return Vec512{_mm512_setzero_si512()}; +} +HWY_API Vec512 Zero(Full512 /* tag */) { + return Vec512{_mm512_setzero_ps()}; +} +HWY_API Vec512 Zero(Full512 /* tag */) { + return Vec512{_mm512_setzero_pd()}; +} + +// Returns a vector with all lanes set to "t". +HWY_API Vec512 Set(Full512 /* tag */, const uint8_t t) { + return Vec512{_mm512_set1_epi8(static_cast(t))}; // NOLINT +} +HWY_API Vec512 Set(Full512 /* tag */, const uint16_t t) { + return Vec512{_mm512_set1_epi16(static_cast(t))}; // NOLINT +} +HWY_API Vec512 Set(Full512 /* tag */, const uint32_t t) { + return Vec512{_mm512_set1_epi32(static_cast(t))}; +} +HWY_API Vec512 Set(Full512 /* tag */, const uint64_t t) { + return Vec512{ + _mm512_set1_epi64(static_cast(t))}; // NOLINT +} +HWY_API Vec512 Set(Full512 /* tag */, const int8_t t) { + return Vec512{_mm512_set1_epi8(static_cast(t))}; // NOLINT +} +HWY_API Vec512 Set(Full512 /* tag */, const int16_t t) { + return Vec512{_mm512_set1_epi16(static_cast(t))}; // NOLINT +} +HWY_API Vec512 Set(Full512 /* tag */, const int32_t t) { + return Vec512{_mm512_set1_epi32(t)}; +} +HWY_API Vec512 Set(Full512 /* tag */, const int64_t t) { + return Vec512{ + _mm512_set1_epi64(static_cast(t))}; // NOLINT +} +HWY_API Vec512 Set(Full512 /* tag */, const float t) { + return Vec512{_mm512_set1_ps(t)}; +} +HWY_API Vec512 Set(Full512 /* tag */, const double t) { + return Vec512{_mm512_set1_pd(t)}; +} + +HWY_DIAGNOSTICS(push) +HWY_DIAGNOSTICS_OFF(disable : 4700, ignored "-Wuninitialized") + +// Returns a vector with uninitialized elements. +template +HWY_API Vec512 Undefined(Full512 /* tag */) { + // Available on Clang 6.0, GCC 6.2, ICC 16.03, MSVC 19.14. All but ICC + // generate an XOR instruction. 
+ return Vec512{_mm512_undefined_epi32()}; +} +HWY_API Vec512 Undefined(Full512 /* tag */) { + return Vec512{_mm512_undefined_ps()}; +} +HWY_API Vec512 Undefined(Full512 /* tag */) { + return Vec512{_mm512_undefined_pd()}; +} + +HWY_DIAGNOSTICS(pop) + +// ================================================== LOGICAL + +// ------------------------------ Not + +template +HWY_API Vec512 Not(const Vec512 v) { + using TU = MakeUnsigned; + const __m512i vu = BitCast(Full512(), v).raw; + return BitCast(Full512(), + Vec512{_mm512_ternarylogic_epi32(vu, vu, vu, 0x55)}); +} + +// ------------------------------ And + +template +HWY_API Vec512 And(const Vec512 a, const Vec512 b) { + return Vec512{_mm512_and_si512(a.raw, b.raw)}; +} + +HWY_API Vec512 And(const Vec512 a, const Vec512 b) { + return Vec512{_mm512_and_ps(a.raw, b.raw)}; +} +HWY_API Vec512 And(const Vec512 a, const Vec512 b) { + return Vec512{_mm512_and_pd(a.raw, b.raw)}; +} + +// ------------------------------ AndNot + +// Returns ~not_mask & mask. +template +HWY_API Vec512 AndNot(const Vec512 not_mask, const Vec512 mask) { + return Vec512{_mm512_andnot_si512(not_mask.raw, mask.raw)}; +} +HWY_API Vec512 AndNot(const Vec512 not_mask, + const Vec512 mask) { + return Vec512{_mm512_andnot_ps(not_mask.raw, mask.raw)}; +} +HWY_API Vec512 AndNot(const Vec512 not_mask, + const Vec512 mask) { + return Vec512{_mm512_andnot_pd(not_mask.raw, mask.raw)}; +} + +// ------------------------------ Or + +template +HWY_API Vec512 Or(const Vec512 a, const Vec512 b) { + return Vec512{_mm512_or_si512(a.raw, b.raw)}; +} + +HWY_API Vec512 Or(const Vec512 a, const Vec512 b) { + return Vec512{_mm512_or_ps(a.raw, b.raw)}; +} +HWY_API Vec512 Or(const Vec512 a, const Vec512 b) { + return Vec512{_mm512_or_pd(a.raw, b.raw)}; +} + +// ------------------------------ Xor + +template +HWY_API Vec512 Xor(const Vec512 a, const Vec512 b) { + return Vec512{_mm512_xor_si512(a.raw, b.raw)}; +} + +HWY_API Vec512 Xor(const Vec512 a, const Vec512 b) { + return Vec512{_mm512_xor_ps(a.raw, b.raw)}; +} +HWY_API Vec512 Xor(const Vec512 a, const Vec512 b) { + return Vec512{_mm512_xor_pd(a.raw, b.raw)}; +} + +// ------------------------------ Or3 + +template +HWY_API Vec512 Or3(Vec512 o1, Vec512 o2, Vec512 o3) { + const Full512 d; + const RebindToUnsigned du; + using VU = VFromD; + const __m512i ret = _mm512_ternarylogic_epi64( + BitCast(du, o1).raw, BitCast(du, o2).raw, BitCast(du, o3).raw, 0xFE); + return BitCast(d, VU{ret}); +} + +// ------------------------------ OrAnd + +template +HWY_API Vec512 OrAnd(Vec512 o, Vec512 a1, Vec512 a2) { + const Full512 d; + const RebindToUnsigned du; + using VU = VFromD; + const __m512i ret = _mm512_ternarylogic_epi64( + BitCast(du, o).raw, BitCast(du, a1).raw, BitCast(du, a2).raw, 0xF8); + return BitCast(d, VU{ret}); +} + +// ------------------------------ IfVecThenElse + +template +HWY_API Vec512 IfVecThenElse(Vec512 mask, Vec512 yes, Vec512 no) { + const Full512 d; + const RebindToUnsigned du; + using VU = VFromD; + return BitCast(d, VU{_mm512_ternarylogic_epi64(BitCast(du, mask).raw, + BitCast(du, yes).raw, + BitCast(du, no).raw, 0xCA)}); +} + +// ------------------------------ Operator overloads (internal-only if float) + +template +HWY_API Vec512 operator&(const Vec512 a, const Vec512 b) { + return And(a, b); +} + +template +HWY_API Vec512 operator|(const Vec512 a, const Vec512 b) { + return Or(a, b); +} + +template +HWY_API Vec512 operator^(const Vec512 a, const Vec512 b) { + return Xor(a, b); +} + +// ------------------------------ PopulationCount 
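+
+// Semantics sketch: each lane is replaced by the number of set bits in that
+// lane; e.g. with u8 lanes, every lane of PopulationCount(Set(d, 0xF0)) is 4.
+// Targets without these instructions use the non-native fallback in
+// generic_ops-inl.h.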
+ +// 8/16 require BITALG, 32/64 require VPOPCNTDQ. +#if HWY_TARGET == HWY_AVX3_DL + +#ifdef HWY_NATIVE_POPCNT +#undef HWY_NATIVE_POPCNT +#else +#define HWY_NATIVE_POPCNT +#endif + +namespace detail { + +template +HWY_INLINE Vec512 PopulationCount(hwy::SizeTag<1> /* tag */, Vec512 v) { + return Vec512{_mm512_popcnt_epi8(v.raw)}; +} +template +HWY_INLINE Vec512 PopulationCount(hwy::SizeTag<2> /* tag */, Vec512 v) { + return Vec512{_mm512_popcnt_epi16(v.raw)}; +} +template +HWY_INLINE Vec512 PopulationCount(hwy::SizeTag<4> /* tag */, Vec512 v) { + return Vec512{_mm512_popcnt_epi32(v.raw)}; +} +template +HWY_INLINE Vec512 PopulationCount(hwy::SizeTag<8> /* tag */, Vec512 v) { + return Vec512{_mm512_popcnt_epi64(v.raw)}; +} + +} // namespace detail + +template +HWY_API Vec512 PopulationCount(Vec512 v) { + return detail::PopulationCount(hwy::SizeTag(), v); +} + +#endif // HWY_TARGET == HWY_AVX3_DL + +// ================================================== SIGN + +// ------------------------------ CopySign + +template +HWY_API Vec512 CopySign(const Vec512 magn, const Vec512 sign) { + static_assert(IsFloat(), "Only makes sense for floating-point"); + + const Full512 d; + const auto msb = SignBit(d); + + const Rebind, decltype(d)> du; + // Truth table for msb, magn, sign | bitwise msb ? sign : mag + // 0 0 0 | 0 + // 0 0 1 | 0 + // 0 1 0 | 1 + // 0 1 1 | 1 + // 1 0 0 | 0 + // 1 0 1 | 1 + // 1 1 0 | 0 + // 1 1 1 | 1 + // The lane size does not matter because we are not using predication. + const __m512i out = _mm512_ternarylogic_epi32( + BitCast(du, msb).raw, BitCast(du, magn).raw, BitCast(du, sign).raw, 0xAC); + return BitCast(d, decltype(Zero(du)){out}); +} + +template +HWY_API Vec512 CopySignToAbs(const Vec512 abs, const Vec512 sign) { + // AVX3 can also handle abs < 0, so no extra action needed. + return CopySign(abs, sign); +} + +// ================================================== MASK + +// ------------------------------ FirstN + +// Possibilities for constructing a bitmask of N ones: +// - kshift* only consider the lowest byte of the shift count, so they would +// not correctly handle large n. +// - Scalar shifts >= 64 are UB. +// - BZHI has the desired semantics; we assume AVX-512 implies BMI2. However, +// we need 64-bit masks for sizeof(T) == 1, so special-case 32-bit builds. + +#if HWY_ARCH_X86_32 +namespace detail { + +// 32 bit mask is sufficient for lane size >= 2. +template +HWY_INLINE Mask512 FirstN(size_t n) { + Mask512 m; + const uint32_t all = ~uint32_t{0}; + // BZHI only looks at the lower 8 bits of n! + m.raw = static_cast((n > 255) ? all : _bzhi_u32(all, n)); + return m; +} + +template +HWY_INLINE Mask512 FirstN(size_t n) { + const uint64_t bits = n < 64 ? ((1ULL << n) - 1) : ~uint64_t{0}; + return Mask512{static_cast<__mmask64>(bits)}; +} + +} // namespace detail +#endif // HWY_ARCH_X86_32 + +template +HWY_API Mask512 FirstN(const Full512 /*tag*/, size_t n) { +#if HWY_ARCH_X86_64 + Mask512 m; + const uint64_t all = ~uint64_t{0}; + // BZHI only looks at the lower 8 bits of n! + m.raw = static_cast((n > 255) ? all : _bzhi_u64(all, n)); + return m; +#else + return detail::FirstN(n); +#endif // HWY_ARCH_X86_64 +} + +// ------------------------------ IfThenElse + +// Returns mask ? b : a. + +namespace detail { + +// Templates for signed/unsigned integer of a particular size. 
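+// Each lane size maps to one masked-move intrinsic below; e.g. for 32-bit
+// lanes, IfThenElse(m, yes, no) lowers to a single masked register move
+// (_mm512_mask_mov_epi32), taking `yes` where the mask bit is set, else `no`.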
+template +HWY_INLINE Vec512 IfThenElse(hwy::SizeTag<1> /* tag */, + const Mask512 mask, const Vec512 yes, + const Vec512 no) { + return Vec512{_mm512_mask_mov_epi8(no.raw, mask.raw, yes.raw)}; +} +template +HWY_INLINE Vec512 IfThenElse(hwy::SizeTag<2> /* tag */, + const Mask512 mask, const Vec512 yes, + const Vec512 no) { + return Vec512{_mm512_mask_mov_epi16(no.raw, mask.raw, yes.raw)}; +} +template +HWY_INLINE Vec512 IfThenElse(hwy::SizeTag<4> /* tag */, + const Mask512 mask, const Vec512 yes, + const Vec512 no) { + return Vec512{_mm512_mask_mov_epi32(no.raw, mask.raw, yes.raw)}; +} +template +HWY_INLINE Vec512 IfThenElse(hwy::SizeTag<8> /* tag */, + const Mask512 mask, const Vec512 yes, + const Vec512 no) { + return Vec512{_mm512_mask_mov_epi64(no.raw, mask.raw, yes.raw)}; +} + +} // namespace detail + +template +HWY_API Vec512 IfThenElse(const Mask512 mask, const Vec512 yes, + const Vec512 no) { + return detail::IfThenElse(hwy::SizeTag(), mask, yes, no); +} +HWY_API Vec512 IfThenElse(const Mask512 mask, + const Vec512 yes, + const Vec512 no) { + return Vec512{_mm512_mask_mov_ps(no.raw, mask.raw, yes.raw)}; +} +HWY_API Vec512 IfThenElse(const Mask512 mask, + const Vec512 yes, + const Vec512 no) { + return Vec512{_mm512_mask_mov_pd(no.raw, mask.raw, yes.raw)}; +} + +namespace detail { + +template +HWY_INLINE Vec512 IfThenElseZero(hwy::SizeTag<1> /* tag */, + const Mask512 mask, + const Vec512 yes) { + return Vec512{_mm512_maskz_mov_epi8(mask.raw, yes.raw)}; +} +template +HWY_INLINE Vec512 IfThenElseZero(hwy::SizeTag<2> /* tag */, + const Mask512 mask, + const Vec512 yes) { + return Vec512{_mm512_maskz_mov_epi16(mask.raw, yes.raw)}; +} +template +HWY_INLINE Vec512 IfThenElseZero(hwy::SizeTag<4> /* tag */, + const Mask512 mask, + const Vec512 yes) { + return Vec512{_mm512_maskz_mov_epi32(mask.raw, yes.raw)}; +} +template +HWY_INLINE Vec512 IfThenElseZero(hwy::SizeTag<8> /* tag */, + const Mask512 mask, + const Vec512 yes) { + return Vec512{_mm512_maskz_mov_epi64(mask.raw, yes.raw)}; +} + +} // namespace detail + +template +HWY_API Vec512 IfThenElseZero(const Mask512 mask, const Vec512 yes) { + return detail::IfThenElseZero(hwy::SizeTag(), mask, yes); +} +HWY_API Vec512 IfThenElseZero(const Mask512 mask, + const Vec512 yes) { + return Vec512{_mm512_maskz_mov_ps(mask.raw, yes.raw)}; +} +HWY_API Vec512 IfThenElseZero(const Mask512 mask, + const Vec512 yes) { + return Vec512{_mm512_maskz_mov_pd(mask.raw, yes.raw)}; +} + +namespace detail { + +template +HWY_INLINE Vec512 IfThenZeroElse(hwy::SizeTag<1> /* tag */, + const Mask512 mask, const Vec512 no) { + // xor_epi8/16 are missing, but we have sub, which is just as fast for u8/16. 
+ return Vec512{_mm512_mask_sub_epi8(no.raw, mask.raw, no.raw, no.raw)}; +} +template +HWY_INLINE Vec512 IfThenZeroElse(hwy::SizeTag<2> /* tag */, + const Mask512 mask, const Vec512 no) { + return Vec512{_mm512_mask_sub_epi16(no.raw, mask.raw, no.raw, no.raw)}; +} +template +HWY_INLINE Vec512 IfThenZeroElse(hwy::SizeTag<4> /* tag */, + const Mask512 mask, const Vec512 no) { + return Vec512{_mm512_mask_xor_epi32(no.raw, mask.raw, no.raw, no.raw)}; +} +template +HWY_INLINE Vec512 IfThenZeroElse(hwy::SizeTag<8> /* tag */, + const Mask512 mask, const Vec512 no) { + return Vec512{_mm512_mask_xor_epi64(no.raw, mask.raw, no.raw, no.raw)}; +} + +} // namespace detail + +template +HWY_API Vec512 IfThenZeroElse(const Mask512 mask, const Vec512 no) { + return detail::IfThenZeroElse(hwy::SizeTag(), mask, no); +} +HWY_API Vec512 IfThenZeroElse(const Mask512 mask, + const Vec512 no) { + return Vec512{_mm512_mask_xor_ps(no.raw, mask.raw, no.raw, no.raw)}; +} +HWY_API Vec512 IfThenZeroElse(const Mask512 mask, + const Vec512 no) { + return Vec512{_mm512_mask_xor_pd(no.raw, mask.raw, no.raw, no.raw)}; +} + +template +HWY_API Vec512 IfNegativeThenElse(Vec512 v, Vec512 yes, Vec512 no) { + static_assert(IsSigned(), "Only works for signed/float"); + // AVX3 MaskFromVec only looks at the MSB + return IfThenElse(MaskFromVec(v), yes, no); +} + +template +HWY_API Vec512 ZeroIfNegative(const Vec512 v) { + // AVX3 MaskFromVec only looks at the MSB + return IfThenZeroElse(MaskFromVec(v), v); +} + +// ================================================== ARITHMETIC + +// ------------------------------ Addition + +// Unsigned +HWY_API Vec512 operator+(const Vec512 a, + const Vec512 b) { + return Vec512{_mm512_add_epi8(a.raw, b.raw)}; +} +HWY_API Vec512 operator+(const Vec512 a, + const Vec512 b) { + return Vec512{_mm512_add_epi16(a.raw, b.raw)}; +} +HWY_API Vec512 operator+(const Vec512 a, + const Vec512 b) { + return Vec512{_mm512_add_epi32(a.raw, b.raw)}; +} +HWY_API Vec512 operator+(const Vec512 a, + const Vec512 b) { + return Vec512{_mm512_add_epi64(a.raw, b.raw)}; +} + +// Signed +HWY_API Vec512 operator+(const Vec512 a, + const Vec512 b) { + return Vec512{_mm512_add_epi8(a.raw, b.raw)}; +} +HWY_API Vec512 operator+(const Vec512 a, + const Vec512 b) { + return Vec512{_mm512_add_epi16(a.raw, b.raw)}; +} +HWY_API Vec512 operator+(const Vec512 a, + const Vec512 b) { + return Vec512{_mm512_add_epi32(a.raw, b.raw)}; +} +HWY_API Vec512 operator+(const Vec512 a, + const Vec512 b) { + return Vec512{_mm512_add_epi64(a.raw, b.raw)}; +} + +// Float +HWY_API Vec512 operator+(const Vec512 a, const Vec512 b) { + return Vec512{_mm512_add_ps(a.raw, b.raw)}; +} +HWY_API Vec512 operator+(const Vec512 a, + const Vec512 b) { + return Vec512{_mm512_add_pd(a.raw, b.raw)}; +} + +// ------------------------------ Subtraction + +// Unsigned +HWY_API Vec512 operator-(const Vec512 a, + const Vec512 b) { + return Vec512{_mm512_sub_epi8(a.raw, b.raw)}; +} +HWY_API Vec512 operator-(const Vec512 a, + const Vec512 b) { + return Vec512{_mm512_sub_epi16(a.raw, b.raw)}; +} +HWY_API Vec512 operator-(const Vec512 a, + const Vec512 b) { + return Vec512{_mm512_sub_epi32(a.raw, b.raw)}; +} +HWY_API Vec512 operator-(const Vec512 a, + const Vec512 b) { + return Vec512{_mm512_sub_epi64(a.raw, b.raw)}; +} + +// Signed +HWY_API Vec512 operator-(const Vec512 a, + const Vec512 b) { + return Vec512{_mm512_sub_epi8(a.raw, b.raw)}; +} +HWY_API Vec512 operator-(const Vec512 a, + const Vec512 b) { + return Vec512{_mm512_sub_epi16(a.raw, b.raw)}; +} +HWY_API Vec512 
operator-(const Vec512 a, + const Vec512 b) { + return Vec512{_mm512_sub_epi32(a.raw, b.raw)}; +} +HWY_API Vec512 operator-(const Vec512 a, + const Vec512 b) { + return Vec512{_mm512_sub_epi64(a.raw, b.raw)}; +} + +// Float +HWY_API Vec512 operator-(const Vec512 a, const Vec512 b) { + return Vec512{_mm512_sub_ps(a.raw, b.raw)}; +} +HWY_API Vec512 operator-(const Vec512 a, + const Vec512 b) { + return Vec512{_mm512_sub_pd(a.raw, b.raw)}; +} + +// ------------------------------ SumsOf8 +HWY_API Vec512 SumsOf8(const Vec512 v) { + return Vec512{_mm512_sad_epu8(v.raw, _mm512_setzero_si512())}; +} + +// ------------------------------ SaturatedAdd + +// Returns a + b clamped to the destination range. + +// Unsigned +HWY_API Vec512 SaturatedAdd(const Vec512 a, + const Vec512 b) { + return Vec512{_mm512_adds_epu8(a.raw, b.raw)}; +} +HWY_API Vec512 SaturatedAdd(const Vec512 a, + const Vec512 b) { + return Vec512{_mm512_adds_epu16(a.raw, b.raw)}; +} + +// Signed +HWY_API Vec512 SaturatedAdd(const Vec512 a, + const Vec512 b) { + return Vec512{_mm512_adds_epi8(a.raw, b.raw)}; +} +HWY_API Vec512 SaturatedAdd(const Vec512 a, + const Vec512 b) { + return Vec512{_mm512_adds_epi16(a.raw, b.raw)}; +} + +// ------------------------------ SaturatedSub + +// Returns a - b clamped to the destination range. + +// Unsigned +HWY_API Vec512 SaturatedSub(const Vec512 a, + const Vec512 b) { + return Vec512{_mm512_subs_epu8(a.raw, b.raw)}; +} +HWY_API Vec512 SaturatedSub(const Vec512 a, + const Vec512 b) { + return Vec512{_mm512_subs_epu16(a.raw, b.raw)}; +} + +// Signed +HWY_API Vec512 SaturatedSub(const Vec512 a, + const Vec512 b) { + return Vec512{_mm512_subs_epi8(a.raw, b.raw)}; +} +HWY_API Vec512 SaturatedSub(const Vec512 a, + const Vec512 b) { + return Vec512{_mm512_subs_epi16(a.raw, b.raw)}; +} + +// ------------------------------ Average + +// Returns (a + b + 1) / 2 + +// Unsigned +HWY_API Vec512 AverageRound(const Vec512 a, + const Vec512 b) { + return Vec512{_mm512_avg_epu8(a.raw, b.raw)}; +} +HWY_API Vec512 AverageRound(const Vec512 a, + const Vec512 b) { + return Vec512{_mm512_avg_epu16(a.raw, b.raw)}; +} + +// ------------------------------ Abs (Sub) + +// Returns absolute value, except that LimitsMin() maps to LimitsMax() + 1. +HWY_API Vec512 Abs(const Vec512 v) { +#if HWY_COMPILER_MSVC + // Workaround for incorrect codegen? (untested due to internal compiler error) + const auto zero = Zero(Full512()); + return Vec512{_mm512_max_epi8(v.raw, (zero - v).raw)}; +#else + return Vec512{_mm512_abs_epi8(v.raw)}; +#endif +} +HWY_API Vec512 Abs(const Vec512 v) { + return Vec512{_mm512_abs_epi16(v.raw)}; +} +HWY_API Vec512 Abs(const Vec512 v) { + return Vec512{_mm512_abs_epi32(v.raw)}; +} +HWY_API Vec512 Abs(const Vec512 v) { + return Vec512{_mm512_abs_epi64(v.raw)}; +} + +// These aren't native instructions, they also involve AND with constant. 
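+// Both clear the sign bit, i.e. Abs(v) behaves like AndNot(SignBit(d), v):
+// Abs of -0.0 is +0.0, and NaN payloads pass through unchanged.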
+HWY_API Vec512 Abs(const Vec512 v) { + return Vec512{_mm512_abs_ps(v.raw)}; +} +HWY_API Vec512 Abs(const Vec512 v) { + return Vec512{_mm512_abs_pd(v.raw)}; +} +// ------------------------------ ShiftLeft + +template +HWY_API Vec512 ShiftLeft(const Vec512 v) { + return Vec512{_mm512_slli_epi16(v.raw, kBits)}; +} + +template +HWY_API Vec512 ShiftLeft(const Vec512 v) { + return Vec512{_mm512_slli_epi32(v.raw, kBits)}; +} + +template +HWY_API Vec512 ShiftLeft(const Vec512 v) { + return Vec512{_mm512_slli_epi64(v.raw, kBits)}; +} + +template +HWY_API Vec512 ShiftLeft(const Vec512 v) { + return Vec512{_mm512_slli_epi16(v.raw, kBits)}; +} + +template +HWY_API Vec512 ShiftLeft(const Vec512 v) { + return Vec512{_mm512_slli_epi32(v.raw, kBits)}; +} + +template +HWY_API Vec512 ShiftLeft(const Vec512 v) { + return Vec512{_mm512_slli_epi64(v.raw, kBits)}; +} + +template +HWY_API Vec512 ShiftLeft(const Vec512 v) { + const Full512 d8; + const RepartitionToWide d16; + const auto shifted = BitCast(d8, ShiftLeft(BitCast(d16, v))); + return kBits == 1 + ? (v + v) + : (shifted & Set(d8, static_cast((0xFF << kBits) & 0xFF))); +} + +// ------------------------------ ShiftRight + +template +HWY_API Vec512 ShiftRight(const Vec512 v) { + return Vec512{_mm512_srli_epi16(v.raw, kBits)}; +} + +template +HWY_API Vec512 ShiftRight(const Vec512 v) { + return Vec512{_mm512_srli_epi32(v.raw, kBits)}; +} + +template +HWY_API Vec512 ShiftRight(const Vec512 v) { + return Vec512{_mm512_srli_epi64(v.raw, kBits)}; +} + +template +HWY_API Vec512 ShiftRight(const Vec512 v) { + const Full512 d8; + // Use raw instead of BitCast to support N=1. + const Vec512 shifted{ShiftRight(Vec512{v.raw}).raw}; + return shifted & Set(d8, 0xFF >> kBits); +} + +template +HWY_API Vec512 ShiftRight(const Vec512 v) { + return Vec512{_mm512_srai_epi16(v.raw, kBits)}; +} + +template +HWY_API Vec512 ShiftRight(const Vec512 v) { + return Vec512{_mm512_srai_epi32(v.raw, kBits)}; +} + +template +HWY_API Vec512 ShiftRight(const Vec512 v) { + return Vec512{_mm512_srai_epi64(v.raw, kBits)}; +} + +template +HWY_API Vec512 ShiftRight(const Vec512 v) { + const Full512 di; + const Full512 du; + const auto shifted = BitCast(di, ShiftRight(BitCast(du, v))); + const auto shifted_sign = BitCast(di, Set(du, 0x80 >> kBits)); + return (shifted ^ shifted_sign) - shifted_sign; +} + +// ------------------------------ RotateRight + +template +HWY_API Vec512 RotateRight(const Vec512 v) { + static_assert(0 <= kBits && kBits < 32, "Invalid shift count"); + return Vec512{_mm512_ror_epi32(v.raw, kBits)}; +} + +template +HWY_API Vec512 RotateRight(const Vec512 v) { + static_assert(0 <= kBits && kBits < 64, "Invalid shift count"); + return Vec512{_mm512_ror_epi64(v.raw, kBits)}; +} + +// ------------------------------ ShiftLeftSame + +HWY_API Vec512 ShiftLeftSame(const Vec512 v, + const int bits) { + return Vec512{_mm512_sll_epi16(v.raw, _mm_cvtsi32_si128(bits))}; +} +HWY_API Vec512 ShiftLeftSame(const Vec512 v, + const int bits) { + return Vec512{_mm512_sll_epi32(v.raw, _mm_cvtsi32_si128(bits))}; +} +HWY_API Vec512 ShiftLeftSame(const Vec512 v, + const int bits) { + return Vec512{_mm512_sll_epi64(v.raw, _mm_cvtsi32_si128(bits))}; +} + +HWY_API Vec512 ShiftLeftSame(const Vec512 v, const int bits) { + return Vec512{_mm512_sll_epi16(v.raw, _mm_cvtsi32_si128(bits))}; +} + +HWY_API Vec512 ShiftLeftSame(const Vec512 v, const int bits) { + return Vec512{_mm512_sll_epi32(v.raw, _mm_cvtsi32_si128(bits))}; +} + +HWY_API Vec512 ShiftLeftSame(const Vec512 v, const int bits) { + return 
Vec512{_mm512_sll_epi64(v.raw, _mm_cvtsi32_si128(bits))}; +} + +template +HWY_API Vec512 ShiftLeftSame(const Vec512 v, const int bits) { + const Full512 d8; + const RepartitionToWide d16; + const auto shifted = BitCast(d8, ShiftLeftSame(BitCast(d16, v), bits)); + return shifted & Set(d8, static_cast((0xFF << bits) & 0xFF)); +} + +// ------------------------------ ShiftRightSame + +HWY_API Vec512 ShiftRightSame(const Vec512 v, + const int bits) { + return Vec512{_mm512_srl_epi16(v.raw, _mm_cvtsi32_si128(bits))}; +} +HWY_API Vec512 ShiftRightSame(const Vec512 v, + const int bits) { + return Vec512{_mm512_srl_epi32(v.raw, _mm_cvtsi32_si128(bits))}; +} +HWY_API Vec512 ShiftRightSame(const Vec512 v, + const int bits) { + return Vec512{_mm512_srl_epi64(v.raw, _mm_cvtsi32_si128(bits))}; +} + +HWY_API Vec512 ShiftRightSame(Vec512 v, const int bits) { + const Full512 d8; + const RepartitionToWide d16; + const auto shifted = BitCast(d8, ShiftRightSame(BitCast(d16, v), bits)); + return shifted & Set(d8, static_cast(0xFF >> bits)); +} + +HWY_API Vec512 ShiftRightSame(const Vec512 v, + const int bits) { + return Vec512{_mm512_sra_epi16(v.raw, _mm_cvtsi32_si128(bits))}; +} + +HWY_API Vec512 ShiftRightSame(const Vec512 v, + const int bits) { + return Vec512{_mm512_sra_epi32(v.raw, _mm_cvtsi32_si128(bits))}; +} +HWY_API Vec512 ShiftRightSame(const Vec512 v, + const int bits) { + return Vec512{_mm512_sra_epi64(v.raw, _mm_cvtsi32_si128(bits))}; +} + +HWY_API Vec512 ShiftRightSame(Vec512 v, const int bits) { + const Full512 di; + const Full512 du; + const auto shifted = BitCast(di, ShiftRightSame(BitCast(du, v), bits)); + const auto shifted_sign = + BitCast(di, Set(du, static_cast(0x80 >> bits))); + return (shifted ^ shifted_sign) - shifted_sign; +} + +// ------------------------------ Shl + +HWY_API Vec512 operator<<(const Vec512 v, + const Vec512 bits) { + return Vec512{_mm512_sllv_epi16(v.raw, bits.raw)}; +} + +HWY_API Vec512 operator<<(const Vec512 v, + const Vec512 bits) { + return Vec512{_mm512_sllv_epi32(v.raw, bits.raw)}; +} + +HWY_API Vec512 operator<<(const Vec512 v, + const Vec512 bits) { + return Vec512{_mm512_sllv_epi64(v.raw, bits.raw)}; +} + +// Signed left shift is the same as unsigned. 
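+// Rationale: in two's complement, left-shifting the bit pattern (shifting in
+// zeros) is identical for signed and unsigned lanes, so the overload below
+// simply reuses the unsigned path via BitCast.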
+template +HWY_API Vec512 operator<<(const Vec512 v, const Vec512 bits) { + const Full512 di; + const Full512> du; + return BitCast(di, BitCast(du, v) << BitCast(du, bits)); +} + +// ------------------------------ Shr + +HWY_API Vec512 operator>>(const Vec512 v, + const Vec512 bits) { + return Vec512{_mm512_srlv_epi16(v.raw, bits.raw)}; +} + +HWY_API Vec512 operator>>(const Vec512 v, + const Vec512 bits) { + return Vec512{_mm512_srlv_epi32(v.raw, bits.raw)}; +} + +HWY_API Vec512 operator>>(const Vec512 v, + const Vec512 bits) { + return Vec512{_mm512_srlv_epi64(v.raw, bits.raw)}; +} + +HWY_API Vec512 operator>>(const Vec512 v, + const Vec512 bits) { + return Vec512{_mm512_srav_epi16(v.raw, bits.raw)}; +} + +HWY_API Vec512 operator>>(const Vec512 v, + const Vec512 bits) { + return Vec512{_mm512_srav_epi32(v.raw, bits.raw)}; +} + +HWY_API Vec512 operator>>(const Vec512 v, + const Vec512 bits) { + return Vec512{_mm512_srav_epi64(v.raw, bits.raw)}; +} + +// ------------------------------ Minimum + +// Unsigned +HWY_API Vec512 Min(const Vec512 a, const Vec512 b) { + return Vec512{_mm512_min_epu8(a.raw, b.raw)}; +} +HWY_API Vec512 Min(const Vec512 a, + const Vec512 b) { + return Vec512{_mm512_min_epu16(a.raw, b.raw)}; +} +HWY_API Vec512 Min(const Vec512 a, + const Vec512 b) { + return Vec512{_mm512_min_epu32(a.raw, b.raw)}; +} +HWY_API Vec512 Min(const Vec512 a, + const Vec512 b) { + return Vec512{_mm512_min_epu64(a.raw, b.raw)}; +} + +// Signed +HWY_API Vec512 Min(const Vec512 a, const Vec512 b) { + return Vec512{_mm512_min_epi8(a.raw, b.raw)}; +} +HWY_API Vec512 Min(const Vec512 a, const Vec512 b) { + return Vec512{_mm512_min_epi16(a.raw, b.raw)}; +} +HWY_API Vec512 Min(const Vec512 a, const Vec512 b) { + return Vec512{_mm512_min_epi32(a.raw, b.raw)}; +} +HWY_API Vec512 Min(const Vec512 a, const Vec512 b) { + return Vec512{_mm512_min_epi64(a.raw, b.raw)}; +} + +// Float +HWY_API Vec512 Min(const Vec512 a, const Vec512 b) { + return Vec512{_mm512_min_ps(a.raw, b.raw)}; +} +HWY_API Vec512 Min(const Vec512 a, const Vec512 b) { + return Vec512{_mm512_min_pd(a.raw, b.raw)}; +} + +// ------------------------------ Maximum + +// Unsigned +HWY_API Vec512 Max(const Vec512 a, const Vec512 b) { + return Vec512{_mm512_max_epu8(a.raw, b.raw)}; +} +HWY_API Vec512 Max(const Vec512 a, + const Vec512 b) { + return Vec512{_mm512_max_epu16(a.raw, b.raw)}; +} +HWY_API Vec512 Max(const Vec512 a, + const Vec512 b) { + return Vec512{_mm512_max_epu32(a.raw, b.raw)}; +} +HWY_API Vec512 Max(const Vec512 a, + const Vec512 b) { + return Vec512{_mm512_max_epu64(a.raw, b.raw)}; +} + +// Signed +HWY_API Vec512 Max(const Vec512 a, const Vec512 b) { + return Vec512{_mm512_max_epi8(a.raw, b.raw)}; +} +HWY_API Vec512 Max(const Vec512 a, const Vec512 b) { + return Vec512{_mm512_max_epi16(a.raw, b.raw)}; +} +HWY_API Vec512 Max(const Vec512 a, const Vec512 b) { + return Vec512{_mm512_max_epi32(a.raw, b.raw)}; +} +HWY_API Vec512 Max(const Vec512 a, const Vec512 b) { + return Vec512{_mm512_max_epi64(a.raw, b.raw)}; +} + +// Float +HWY_API Vec512 Max(const Vec512 a, const Vec512 b) { + return Vec512{_mm512_max_ps(a.raw, b.raw)}; +} +HWY_API Vec512 Max(const Vec512 a, const Vec512 b) { + return Vec512{_mm512_max_pd(a.raw, b.raw)}; +} + +// ------------------------------ Integer multiplication + +// Unsigned +HWY_API Vec512 operator*(Vec512 a, Vec512 b) { + return Vec512{_mm512_mullo_epi16(a.raw, b.raw)}; +} +HWY_API Vec512 operator*(Vec512 a, Vec512 b) { + return Vec512{_mm512_mullo_epi32(a.raw, b.raw)}; +} +HWY_API Vec512 
operator*(Vec512 a, Vec512 b) { + return Vec512{_mm512_mullo_epi64(a.raw, b.raw)}; +} +HWY_API Vec256 operator*(Vec256 a, Vec256 b) { + return Vec256{_mm256_mullo_epi64(a.raw, b.raw)}; +} +HWY_API Vec128 operator*(Vec128 a, Vec128 b) { + return Vec128{_mm_mullo_epi64(a.raw, b.raw)}; +} + +// Per-target flag to prevent generic_ops-inl.h from defining i64 operator*. +#ifdef HWY_NATIVE_I64MULLO +#undef HWY_NATIVE_I64MULLO +#else +#define HWY_NATIVE_I64MULLO +#endif + +// Signed +HWY_API Vec512 operator*(Vec512 a, Vec512 b) { + return Vec512{_mm512_mullo_epi16(a.raw, b.raw)}; +} +HWY_API Vec512 operator*(Vec512 a, Vec512 b) { + return Vec512{_mm512_mullo_epi32(a.raw, b.raw)}; +} +HWY_API Vec512 operator*(Vec512 a, Vec512 b) { + return Vec512{_mm512_mullo_epi64(a.raw, b.raw)}; +} +HWY_API Vec256 operator*(Vec256 a, Vec256 b) { + return Vec256{_mm256_mullo_epi64(a.raw, b.raw)}; +} +HWY_API Vec128 operator*(Vec128 a, Vec128 b) { + return Vec128{_mm_mullo_epi64(a.raw, b.raw)}; +} +// Returns the upper 16 bits of a * b in each lane. +HWY_API Vec512 MulHigh(Vec512 a, Vec512 b) { + return Vec512{_mm512_mulhi_epu16(a.raw, b.raw)}; +} +HWY_API Vec512 MulHigh(Vec512 a, Vec512 b) { + return Vec512{_mm512_mulhi_epi16(a.raw, b.raw)}; +} + +HWY_API Vec512 MulFixedPoint15(Vec512 a, Vec512 b) { + return Vec512{_mm512_mulhrs_epi16(a.raw, b.raw)}; +} + +// Multiplies even lanes (0, 2 ..) and places the double-wide result into +// even and the upper half into its odd neighbor lane. +HWY_API Vec512 MulEven(Vec512 a, Vec512 b) { + return Vec512{_mm512_mul_epi32(a.raw, b.raw)}; +} +HWY_API Vec512 MulEven(Vec512 a, Vec512 b) { + return Vec512{_mm512_mul_epu32(a.raw, b.raw)}; +} + +// ------------------------------ Neg (Sub) + +template +HWY_API Vec512 Neg(const Vec512 v) { + return Xor(v, SignBit(Full512())); +} + +template +HWY_API Vec512 Neg(const Vec512 v) { + return Zero(Full512()) - v; +} + +// ------------------------------ Floating-point mul / div + +HWY_API Vec512 operator*(const Vec512 a, const Vec512 b) { + return Vec512{_mm512_mul_ps(a.raw, b.raw)}; +} +HWY_API Vec512 operator*(const Vec512 a, + const Vec512 b) { + return Vec512{_mm512_mul_pd(a.raw, b.raw)}; +} + +HWY_API Vec512 operator/(const Vec512 a, const Vec512 b) { + return Vec512{_mm512_div_ps(a.raw, b.raw)}; +} +HWY_API Vec512 operator/(const Vec512 a, + const Vec512 b) { + return Vec512{_mm512_div_pd(a.raw, b.raw)}; +} + +// Approximate reciprocal +HWY_API Vec512 ApproximateReciprocal(const Vec512 v) { + return Vec512{_mm512_rcp14_ps(v.raw)}; +} + +// Absolute value of difference. 
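+// For example, AbsDiff(Set(d, 3.0f), Set(d, 5.0f)) is 2.0f in every lane.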
+HWY_API Vec512 AbsDiff(const Vec512 a, const Vec512 b) { + return Abs(a - b); +} + +// ------------------------------ Floating-point multiply-add variants + +// Returns mul * x + add +HWY_API Vec512 MulAdd(const Vec512 mul, const Vec512 x, + const Vec512 add) { + return Vec512{_mm512_fmadd_ps(mul.raw, x.raw, add.raw)}; +} +HWY_API Vec512 MulAdd(const Vec512 mul, const Vec512 x, + const Vec512 add) { + return Vec512{_mm512_fmadd_pd(mul.raw, x.raw, add.raw)}; +} + +// Returns add - mul * x +HWY_API Vec512 NegMulAdd(const Vec512 mul, const Vec512 x, + const Vec512 add) { + return Vec512{_mm512_fnmadd_ps(mul.raw, x.raw, add.raw)}; +} +HWY_API Vec512 NegMulAdd(const Vec512 mul, + const Vec512 x, + const Vec512 add) { + return Vec512{_mm512_fnmadd_pd(mul.raw, x.raw, add.raw)}; +} + +// Returns mul * x - sub +HWY_API Vec512 MulSub(const Vec512 mul, const Vec512 x, + const Vec512 sub) { + return Vec512{_mm512_fmsub_ps(mul.raw, x.raw, sub.raw)}; +} +HWY_API Vec512 MulSub(const Vec512 mul, const Vec512 x, + const Vec512 sub) { + return Vec512{_mm512_fmsub_pd(mul.raw, x.raw, sub.raw)}; +} + +// Returns -mul * x - sub +HWY_API Vec512 NegMulSub(const Vec512 mul, const Vec512 x, + const Vec512 sub) { + return Vec512{_mm512_fnmsub_ps(mul.raw, x.raw, sub.raw)}; +} +HWY_API Vec512 NegMulSub(const Vec512 mul, + const Vec512 x, + const Vec512 sub) { + return Vec512{_mm512_fnmsub_pd(mul.raw, x.raw, sub.raw)}; +} + +// ------------------------------ Floating-point square root + +// Full precision square root +HWY_API Vec512 Sqrt(const Vec512 v) { + return Vec512{_mm512_sqrt_ps(v.raw)}; +} +HWY_API Vec512 Sqrt(const Vec512 v) { + return Vec512{_mm512_sqrt_pd(v.raw)}; +} + +// Approximate reciprocal square root +HWY_API Vec512 ApproximateReciprocalSqrt(const Vec512 v) { + return Vec512{_mm512_rsqrt14_ps(v.raw)}; +} + +// ------------------------------ Floating-point rounding + +// Work around warnings in the intrinsic definitions (passing -1 as a mask). +HWY_DIAGNOSTICS(push) +HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion") + +// Toward nearest integer, tie to even +HWY_API Vec512 Round(const Vec512 v) { + return Vec512{_mm512_roundscale_ps( + v.raw, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)}; +} +HWY_API Vec512 Round(const Vec512 v) { + return Vec512{_mm512_roundscale_pd( + v.raw, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)}; +} + +// Toward zero, aka truncate +HWY_API Vec512 Trunc(const Vec512 v) { + return Vec512{ + _mm512_roundscale_ps(v.raw, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)}; +} +HWY_API Vec512 Trunc(const Vec512 v) { + return Vec512{ + _mm512_roundscale_pd(v.raw, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)}; +} + +// Toward +infinity, aka ceiling +HWY_API Vec512 Ceil(const Vec512 v) { + return Vec512{ + _mm512_roundscale_ps(v.raw, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)}; +} +HWY_API Vec512 Ceil(const Vec512 v) { + return Vec512{ + _mm512_roundscale_pd(v.raw, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)}; +} + +// Toward -infinity, aka floor +HWY_API Vec512 Floor(const Vec512 v) { + return Vec512{ + _mm512_roundscale_ps(v.raw, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)}; +} +HWY_API Vec512 Floor(const Vec512 v) { + return Vec512{ + _mm512_roundscale_pd(v.raw, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)}; +} + +HWY_DIAGNOSTICS(pop) + +// ================================================== COMPARE + +// Comparisons set a mask bit to 1 if the condition is true, else 0. 
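+// Typical use (a minimal sketch; v is any f32 vector): count the lanes that
+// satisfy a predicate.
+//   const Full512<float> d;
+//   const size_t num_pos = CountTrue(d, v > Zero(d));  // lanes with v[i] > 0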
+ +template +HWY_API Mask512 RebindMask(Full512 /*tag*/, Mask512 m) { + static_assert(sizeof(TFrom) == sizeof(TTo), "Must have same size"); + return Mask512{m.raw}; +} + +namespace detail { + +template +HWY_INLINE Mask512 TestBit(hwy::SizeTag<1> /*tag*/, const Vec512 v, + const Vec512 bit) { + return Mask512{_mm512_test_epi8_mask(v.raw, bit.raw)}; +} +template +HWY_INLINE Mask512 TestBit(hwy::SizeTag<2> /*tag*/, const Vec512 v, + const Vec512 bit) { + return Mask512{_mm512_test_epi16_mask(v.raw, bit.raw)}; +} +template +HWY_INLINE Mask512 TestBit(hwy::SizeTag<4> /*tag*/, const Vec512 v, + const Vec512 bit) { + return Mask512{_mm512_test_epi32_mask(v.raw, bit.raw)}; +} +template +HWY_INLINE Mask512 TestBit(hwy::SizeTag<8> /*tag*/, const Vec512 v, + const Vec512 bit) { + return Mask512{_mm512_test_epi64_mask(v.raw, bit.raw)}; +} + +} // namespace detail + +template +HWY_API Mask512 TestBit(const Vec512 v, const Vec512 bit) { + static_assert(!hwy::IsFloat(), "Only integer vectors supported"); + return detail::TestBit(hwy::SizeTag(), v, bit); +} + +// ------------------------------ Equality + +template +HWY_API Mask512 operator==(Vec512 a, Vec512 b) { + return Mask512{_mm512_cmpeq_epi8_mask(a.raw, b.raw)}; +} +template +HWY_API Mask512 operator==(Vec512 a, Vec512 b) { + return Mask512{_mm512_cmpeq_epi16_mask(a.raw, b.raw)}; +} +template +HWY_API Mask512 operator==(Vec512 a, Vec512 b) { + return Mask512{_mm512_cmpeq_epi32_mask(a.raw, b.raw)}; +} +template +HWY_API Mask512 operator==(Vec512 a, Vec512 b) { + return Mask512{_mm512_cmpeq_epi64_mask(a.raw, b.raw)}; +} + +HWY_API Mask512 operator==(Vec512 a, Vec512 b) { + return Mask512{_mm512_cmp_ps_mask(a.raw, b.raw, _CMP_EQ_OQ)}; +} + +HWY_API Mask512 operator==(Vec512 a, Vec512 b) { + return Mask512{_mm512_cmp_pd_mask(a.raw, b.raw, _CMP_EQ_OQ)}; +} + +// ------------------------------ Inequality + +template +HWY_API Mask512 operator!=(Vec512 a, Vec512 b) { + return Mask512{_mm512_cmpneq_epi8_mask(a.raw, b.raw)}; +} +template +HWY_API Mask512 operator!=(Vec512 a, Vec512 b) { + return Mask512{_mm512_cmpneq_epi16_mask(a.raw, b.raw)}; +} +template +HWY_API Mask512 operator!=(Vec512 a, Vec512 b) { + return Mask512{_mm512_cmpneq_epi32_mask(a.raw, b.raw)}; +} +template +HWY_API Mask512 operator!=(Vec512 a, Vec512 b) { + return Mask512{_mm512_cmpneq_epi64_mask(a.raw, b.raw)}; +} + +HWY_API Mask512 operator!=(Vec512 a, Vec512 b) { + return Mask512{_mm512_cmp_ps_mask(a.raw, b.raw, _CMP_NEQ_OQ)}; +} + +HWY_API Mask512 operator!=(Vec512 a, Vec512 b) { + return Mask512{_mm512_cmp_pd_mask(a.raw, b.raw, _CMP_NEQ_OQ)}; +} + +// ------------------------------ Strict inequality + +HWY_API Mask512 operator>(Vec512 a, Vec512 b) { + return Mask512{_mm512_cmpgt_epu8_mask(a.raw, b.raw)}; +} +HWY_API Mask512 operator>(Vec512 a, Vec512 b) { + return Mask512{_mm512_cmpgt_epu16_mask(a.raw, b.raw)}; +} +HWY_API Mask512 operator>(Vec512 a, Vec512 b) { + return Mask512{_mm512_cmpgt_epu32_mask(a.raw, b.raw)}; +} +HWY_API Mask512 operator>(Vec512 a, Vec512 b) { + return Mask512{_mm512_cmpgt_epu64_mask(a.raw, b.raw)}; +} + +HWY_API Mask512 operator>(Vec512 a, Vec512 b) { + return Mask512{_mm512_cmpgt_epi8_mask(a.raw, b.raw)}; +} +HWY_API Mask512 operator>(Vec512 a, Vec512 b) { + return Mask512{_mm512_cmpgt_epi16_mask(a.raw, b.raw)}; +} +HWY_API Mask512 operator>(Vec512 a, Vec512 b) { + return Mask512{_mm512_cmpgt_epi32_mask(a.raw, b.raw)}; +} +HWY_API Mask512 operator>(Vec512 a, Vec512 b) { + return Mask512{_mm512_cmpgt_epi64_mask(a.raw, b.raw)}; +} + +HWY_API Mask512 
operator>(Vec512 a, Vec512 b) { + return Mask512{_mm512_cmp_ps_mask(a.raw, b.raw, _CMP_GT_OQ)}; +} +HWY_API Mask512 operator>(Vec512 a, Vec512 b) { + return Mask512{_mm512_cmp_pd_mask(a.raw, b.raw, _CMP_GT_OQ)}; +} + +// ------------------------------ Weak inequality + +HWY_API Mask512 operator>=(Vec512 a, Vec512 b) { + return Mask512{_mm512_cmp_ps_mask(a.raw, b.raw, _CMP_GE_OQ)}; +} +HWY_API Mask512 operator>=(Vec512 a, Vec512 b) { + return Mask512{_mm512_cmp_pd_mask(a.raw, b.raw, _CMP_GE_OQ)}; +} + +// ------------------------------ Reversed comparisons + +template +HWY_API Mask512 operator<(Vec512 a, Vec512 b) { + return b > a; +} + +template +HWY_API Mask512 operator<=(Vec512 a, Vec512 b) { + return b >= a; +} + +// ------------------------------ Mask + +namespace detail { + +template +HWY_INLINE Mask512 MaskFromVec(hwy::SizeTag<1> /*tag*/, const Vec512 v) { + return Mask512{_mm512_movepi8_mask(v.raw)}; +} +template +HWY_INLINE Mask512 MaskFromVec(hwy::SizeTag<2> /*tag*/, const Vec512 v) { + return Mask512{_mm512_movepi16_mask(v.raw)}; +} +template +HWY_INLINE Mask512 MaskFromVec(hwy::SizeTag<4> /*tag*/, const Vec512 v) { + return Mask512{_mm512_movepi32_mask(v.raw)}; +} +template +HWY_INLINE Mask512 MaskFromVec(hwy::SizeTag<8> /*tag*/, const Vec512 v) { + return Mask512{_mm512_movepi64_mask(v.raw)}; +} + +} // namespace detail + +template +HWY_API Mask512 MaskFromVec(const Vec512 v) { + return detail::MaskFromVec(hwy::SizeTag(), v); +} +// There do not seem to be native floating-point versions of these instructions. +HWY_API Mask512 MaskFromVec(const Vec512 v) { + return Mask512{MaskFromVec(BitCast(Full512(), v)).raw}; +} +HWY_API Mask512 MaskFromVec(const Vec512 v) { + return Mask512{MaskFromVec(BitCast(Full512(), v)).raw}; +} + +HWY_API Vec512 VecFromMask(const Mask512 v) { + return Vec512{_mm512_movm_epi8(v.raw)}; +} +HWY_API Vec512 VecFromMask(const Mask512 v) { + return Vec512{_mm512_movm_epi8(v.raw)}; +} + +HWY_API Vec512 VecFromMask(const Mask512 v) { + return Vec512{_mm512_movm_epi16(v.raw)}; +} +HWY_API Vec512 VecFromMask(const Mask512 v) { + return Vec512{_mm512_movm_epi16(v.raw)}; +} + +HWY_API Vec512 VecFromMask(const Mask512 v) { + return Vec512{_mm512_movm_epi32(v.raw)}; +} +HWY_API Vec512 VecFromMask(const Mask512 v) { + return Vec512{_mm512_movm_epi32(v.raw)}; +} +HWY_API Vec512 VecFromMask(const Mask512 v) { + return Vec512{_mm512_castsi512_ps(_mm512_movm_epi32(v.raw))}; +} + +HWY_API Vec512 VecFromMask(const Mask512 v) { + return Vec512{_mm512_movm_epi64(v.raw)}; +} +HWY_API Vec512 VecFromMask(const Mask512 v) { + return Vec512{_mm512_movm_epi64(v.raw)}; +} +HWY_API Vec512 VecFromMask(const Mask512 v) { + return Vec512{_mm512_castsi512_pd(_mm512_movm_epi64(v.raw))}; +} + +template +HWY_API Vec512 VecFromMask(Full512 /* tag */, const Mask512 v) { + return VecFromMask(v); +} + +// ------------------------------ Mask logical + +namespace detail { + +template +HWY_INLINE Mask512 Not(hwy::SizeTag<1> /*tag*/, const Mask512 m) { +#if HWY_COMPILER_HAS_MASK_INTRINSICS + return Mask512{_knot_mask64(m.raw)}; +#else + return Mask512{~m.raw}; +#endif +} +template +HWY_INLINE Mask512 Not(hwy::SizeTag<2> /*tag*/, const Mask512 m) { +#if HWY_COMPILER_HAS_MASK_INTRINSICS + return Mask512{_knot_mask32(m.raw)}; +#else + return Mask512{~m.raw}; +#endif +} +template +HWY_INLINE Mask512 Not(hwy::SizeTag<4> /*tag*/, const Mask512 m) { +#if HWY_COMPILER_HAS_MASK_INTRINSICS + return Mask512{_knot_mask16(m.raw)}; +#else + return Mask512{static_cast(~m.raw & 0xFFFF)}; +#endif +} +template 
+HWY_INLINE Mask512 Not(hwy::SizeTag<8> /*tag*/, const Mask512 m) { +#if HWY_COMPILER_HAS_MASK_INTRINSICS + return Mask512{_knot_mask8(m.raw)}; +#else + return Mask512{static_cast(~m.raw & 0xFF)}; +#endif +} + +template +HWY_INLINE Mask512 And(hwy::SizeTag<1> /*tag*/, const Mask512 a, + const Mask512 b) { +#if HWY_COMPILER_HAS_MASK_INTRINSICS + return Mask512{_kand_mask64(a.raw, b.raw)}; +#else + return Mask512{a.raw & b.raw}; +#endif +} +template +HWY_INLINE Mask512 And(hwy::SizeTag<2> /*tag*/, const Mask512 a, + const Mask512 b) { +#if HWY_COMPILER_HAS_MASK_INTRINSICS + return Mask512{_kand_mask32(a.raw, b.raw)}; +#else + return Mask512{a.raw & b.raw}; +#endif +} +template +HWY_INLINE Mask512 And(hwy::SizeTag<4> /*tag*/, const Mask512 a, + const Mask512 b) { +#if HWY_COMPILER_HAS_MASK_INTRINSICS + return Mask512{_kand_mask16(a.raw, b.raw)}; +#else + return Mask512{static_cast(a.raw & b.raw)}; +#endif +} +template +HWY_INLINE Mask512 And(hwy::SizeTag<8> /*tag*/, const Mask512 a, + const Mask512 b) { +#if HWY_COMPILER_HAS_MASK_INTRINSICS + return Mask512{_kand_mask8(a.raw, b.raw)}; +#else + return Mask512{static_cast(a.raw & b.raw)}; +#endif +} + +template +HWY_INLINE Mask512 AndNot(hwy::SizeTag<1> /*tag*/, const Mask512 a, + const Mask512 b) { +#if HWY_COMPILER_HAS_MASK_INTRINSICS + return Mask512{_kandn_mask64(a.raw, b.raw)}; +#else + return Mask512{~a.raw & b.raw}; +#endif +} +template +HWY_INLINE Mask512 AndNot(hwy::SizeTag<2> /*tag*/, const Mask512 a, + const Mask512 b) { +#if HWY_COMPILER_HAS_MASK_INTRINSICS + return Mask512{_kandn_mask32(a.raw, b.raw)}; +#else + return Mask512{~a.raw & b.raw}; +#endif +} +template +HWY_INLINE Mask512 AndNot(hwy::SizeTag<4> /*tag*/, const Mask512 a, + const Mask512 b) { +#if HWY_COMPILER_HAS_MASK_INTRINSICS + return Mask512{_kandn_mask16(a.raw, b.raw)}; +#else + return Mask512{static_cast(~a.raw & b.raw)}; +#endif +} +template +HWY_INLINE Mask512 AndNot(hwy::SizeTag<8> /*tag*/, const Mask512 a, + const Mask512 b) { +#if HWY_COMPILER_HAS_MASK_INTRINSICS + return Mask512{_kandn_mask8(a.raw, b.raw)}; +#else + return Mask512{static_cast(~a.raw & b.raw)}; +#endif +} + +template +HWY_INLINE Mask512 Or(hwy::SizeTag<1> /*tag*/, const Mask512 a, + const Mask512 b) { +#if HWY_COMPILER_HAS_MASK_INTRINSICS + return Mask512{_kor_mask64(a.raw, b.raw)}; +#else + return Mask512{a.raw | b.raw}; +#endif +} +template +HWY_INLINE Mask512 Or(hwy::SizeTag<2> /*tag*/, const Mask512 a, + const Mask512 b) { +#if HWY_COMPILER_HAS_MASK_INTRINSICS + return Mask512{_kor_mask32(a.raw, b.raw)}; +#else + return Mask512{a.raw | b.raw}; +#endif +} +template +HWY_INLINE Mask512 Or(hwy::SizeTag<4> /*tag*/, const Mask512 a, + const Mask512 b) { +#if HWY_COMPILER_HAS_MASK_INTRINSICS + return Mask512{_kor_mask16(a.raw, b.raw)}; +#else + return Mask512{static_cast(a.raw | b.raw)}; +#endif +} +template +HWY_INLINE Mask512 Or(hwy::SizeTag<8> /*tag*/, const Mask512 a, + const Mask512 b) { +#if HWY_COMPILER_HAS_MASK_INTRINSICS + return Mask512{_kor_mask8(a.raw, b.raw)}; +#else + return Mask512{static_cast(a.raw | b.raw)}; +#endif +} + +template +HWY_INLINE Mask512 Xor(hwy::SizeTag<1> /*tag*/, const Mask512 a, + const Mask512 b) { +#if HWY_COMPILER_HAS_MASK_INTRINSICS + return Mask512{_kxor_mask64(a.raw, b.raw)}; +#else + return Mask512{a.raw ^ b.raw}; +#endif +} +template +HWY_INLINE Mask512 Xor(hwy::SizeTag<2> /*tag*/, const Mask512 a, + const Mask512 b) { +#if HWY_COMPILER_HAS_MASK_INTRINSICS + return Mask512{_kxor_mask32(a.raw, b.raw)}; +#else + return Mask512{a.raw ^ b.raw}; +#endif +} 
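+// Note: where the compiler provides them, the _k* intrinsics keep operands in
+// mask (k) registers and avoid k<->GPR moves; the plain integer fallback
+// computes the same bits on the raw mask value.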
+template +HWY_INLINE Mask512 Xor(hwy::SizeTag<4> /*tag*/, const Mask512 a, + const Mask512 b) { +#if HWY_COMPILER_HAS_MASK_INTRINSICS + return Mask512{_kxor_mask16(a.raw, b.raw)}; +#else + return Mask512{static_cast(a.raw ^ b.raw)}; +#endif +} +template +HWY_INLINE Mask512 Xor(hwy::SizeTag<8> /*tag*/, const Mask512 a, + const Mask512 b) { +#if HWY_COMPILER_HAS_MASK_INTRINSICS + return Mask512{_kxor_mask8(a.raw, b.raw)}; +#else + return Mask512{static_cast(a.raw ^ b.raw)}; +#endif +} + +template +HWY_INLINE Mask512 ExclusiveNeither(hwy::SizeTag<1> /*tag*/, + const Mask512 a, const Mask512 b) { +#if HWY_COMPILER_HAS_MASK_INTRINSICS + return Mask512{_kxnor_mask64(a.raw, b.raw)}; +#else + return Mask512{~(a.raw ^ b.raw)}; +#endif +} +template +HWY_INLINE Mask512 ExclusiveNeither(hwy::SizeTag<2> /*tag*/, + const Mask512 a, const Mask512 b) { +#if HWY_COMPILER_HAS_MASK_INTRINSICS + return Mask512{_kxnor_mask32(a.raw, b.raw)}; +#else + return Mask512{static_cast<__mmask32>(~(a.raw ^ b.raw) & 0xFFFFFFFF)}; +#endif +} +template +HWY_INLINE Mask512 ExclusiveNeither(hwy::SizeTag<4> /*tag*/, + const Mask512 a, const Mask512 b) { +#if HWY_COMPILER_HAS_MASK_INTRINSICS + return Mask512{_kxnor_mask16(a.raw, b.raw)}; +#else + return Mask512{static_cast<__mmask16>(~(a.raw ^ b.raw) & 0xFFFF)}; +#endif +} +template +HWY_INLINE Mask512 ExclusiveNeither(hwy::SizeTag<8> /*tag*/, + const Mask512 a, const Mask512 b) { +#if HWY_COMPILER_HAS_MASK_INTRINSICS + return Mask512{_kxnor_mask8(a.raw, b.raw)}; +#else + return Mask512{static_cast<__mmask8>(~(a.raw ^ b.raw) & 0xFF)}; +#endif +} + +} // namespace detail + +template +HWY_API Mask512 Not(const Mask512 m) { + return detail::Not(hwy::SizeTag(), m); +} + +template +HWY_API Mask512 And(const Mask512 a, Mask512 b) { + return detail::And(hwy::SizeTag(), a, b); +} + +template +HWY_API Mask512 AndNot(const Mask512 a, Mask512 b) { + return detail::AndNot(hwy::SizeTag(), a, b); +} + +template +HWY_API Mask512 Or(const Mask512 a, Mask512 b) { + return detail::Or(hwy::SizeTag(), a, b); +} + +template +HWY_API Mask512 Xor(const Mask512 a, Mask512 b) { + return detail::Xor(hwy::SizeTag(), a, b); +} + +template +HWY_API Mask512 ExclusiveNeither(const Mask512 a, Mask512 b) { + return detail::ExclusiveNeither(hwy::SizeTag(), a, b); +} + +// ------------------------------ BroadcastSignBit (ShiftRight, compare, mask) + +HWY_API Vec512 BroadcastSignBit(const Vec512 v) { + return VecFromMask(v < Zero(Full512())); +} + +HWY_API Vec512 BroadcastSignBit(const Vec512 v) { + return ShiftRight<15>(v); +} + +HWY_API Vec512 BroadcastSignBit(const Vec512 v) { + return ShiftRight<31>(v); +} + +HWY_API Vec512 BroadcastSignBit(const Vec512 v) { + return Vec512{_mm512_srai_epi64(v.raw, 63)}; +} + +// ------------------------------ Floating-point classification (Not) + +HWY_API Mask512 IsNaN(const Vec512 v) { + return Mask512{_mm512_fpclass_ps_mask(v.raw, 0x81)}; +} +HWY_API Mask512 IsNaN(const Vec512 v) { + return Mask512{_mm512_fpclass_pd_mask(v.raw, 0x81)}; +} + +HWY_API Mask512 IsInf(const Vec512 v) { + return Mask512{_mm512_fpclass_ps_mask(v.raw, 0x18)}; +} +HWY_API Mask512 IsInf(const Vec512 v) { + return Mask512{_mm512_fpclass_pd_mask(v.raw, 0x18)}; +} + +// Returns whether normal/subnormal/zero. fpclass doesn't have a flag for +// positive, so we have to check for inf/NaN and negate. 
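+// fpclass imm8 bits (per the Intel SDM): 0x01 QNaN, 0x08 +inf, 0x10 -inf,
+// 0x80 SNaN. Hence 0x81 = any NaN, 0x18 = +/-inf, 0x99 = NaN or +/-inf.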
+HWY_API Mask512<float> IsFinite(const Vec512<float> v) {
+  return Not(Mask512<float>{_mm512_fpclass_ps_mask(v.raw, 0x99)});
+}
+HWY_API Mask512<double> IsFinite(const Vec512<double> v) {
+  return Not(Mask512<double>{_mm512_fpclass_pd_mask(v.raw, 0x99)});
+}
+
+// ================================================== MEMORY
+
+// ------------------------------ Load
+
+template <typename T>
+HWY_API Vec512<T> Load(Full512<T> /* tag */, const T* HWY_RESTRICT aligned) {
+  return Vec512<T>{_mm512_load_si512(aligned)};
+}
+HWY_API Vec512<float> Load(Full512<float> /* tag */,
+                           const float* HWY_RESTRICT aligned) {
+  return Vec512<float>{_mm512_load_ps(aligned)};
+}
+HWY_API Vec512<double> Load(Full512<double> /* tag */,
+                            const double* HWY_RESTRICT aligned) {
+  return Vec512<double>{_mm512_load_pd(aligned)};
+}
+
+template <typename T>
+HWY_API Vec512<T> LoadU(Full512<T> /* tag */, const T* HWY_RESTRICT p) {
+  return Vec512<T>{_mm512_loadu_si512(p)};
+}
+HWY_API Vec512<float> LoadU(Full512<float> /* tag */,
+                            const float* HWY_RESTRICT p) {
+  return Vec512<float>{_mm512_loadu_ps(p)};
+}
+HWY_API Vec512<double> LoadU(Full512<double> /* tag */,
+                             const double* HWY_RESTRICT p) {
+  return Vec512<double>{_mm512_loadu_pd(p)};
+}
+
+// ------------------------------ MaskedLoad
+
+template <typename T, HWY_IF_LANE_SIZE(T, 1)>
+HWY_API Vec512<T> MaskedLoad(Mask512<T> m, Full512<T> /* tag */,
+                             const T* HWY_RESTRICT p) {
+  return Vec512<T>{_mm512_maskz_loadu_epi8(m.raw, p)};
+}
+
+template <typename T, HWY_IF_LANE_SIZE(T, 2)>
+HWY_API Vec512<T> MaskedLoad(Mask512<T> m, Full512<T> /* tag */,
+                             const T* HWY_RESTRICT p) {
+  return Vec512<T>{_mm512_maskz_loadu_epi16(m.raw, p)};
+}
+
+template <typename T, HWY_IF_LANE_SIZE(T, 4)>
+HWY_API Vec512<T> MaskedLoad(Mask512<T> m, Full512<T> /* tag */,
+                             const T* HWY_RESTRICT p) {
+  return Vec512<T>{_mm512_maskz_loadu_epi32(m.raw, p)};
+}
+
+template <typename T, HWY_IF_LANE_SIZE(T, 8)>
+HWY_API Vec512<T> MaskedLoad(Mask512<T> m, Full512<T> /* tag */,
+                             const T* HWY_RESTRICT p) {
+  return Vec512<T>{_mm512_maskz_loadu_epi64(m.raw, p)};
+}
+
+HWY_API Vec512<float> MaskedLoad(Mask512<float> m, Full512<float> /* tag */,
+                                 const float* HWY_RESTRICT p) {
+  return Vec512<float>{_mm512_maskz_loadu_ps(m.raw, p)};
+}
+
+HWY_API Vec512<double> MaskedLoad(Mask512<double> m, Full512<double> /* tag */,
+                                  const double* HWY_RESTRICT p) {
+  return Vec512<double>{_mm512_maskz_loadu_pd(m.raw, p)};
+}
+
+// ------------------------------ LoadDup128
+
+// Loads 128 bits and duplicates them into each 128-bit block of the 512-bit
+// result. This avoids the 3-cycle cost of moving data between 128-bit halves
+// and avoids port 5.
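+// Example (illustrative only): for float lanes p[0..3] = {0,1,2,3},
+// LoadDup128 returns {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3}.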
+template +HWY_API Vec512 LoadDup128(Full512 /* tag */, + const T* const HWY_RESTRICT p) { + const auto x4 = LoadU(Full128(), p); + return Vec512{_mm512_broadcast_i32x4(x4.raw)}; +} +HWY_API Vec512 LoadDup128(Full512 /* tag */, + const float* const HWY_RESTRICT p) { + const __m128 x4 = _mm_loadu_ps(p); + return Vec512{_mm512_broadcast_f32x4(x4)}; +} + +HWY_API Vec512 LoadDup128(Full512 /* tag */, + const double* const HWY_RESTRICT p) { + const __m128d x2 = _mm_loadu_pd(p); + return Vec512{_mm512_broadcast_f64x2(x2)}; +} + +// ------------------------------ Store + +template +HWY_API void Store(const Vec512 v, Full512 /* tag */, + T* HWY_RESTRICT aligned) { + _mm512_store_si512(reinterpret_cast<__m512i*>(aligned), v.raw); +} +HWY_API void Store(const Vec512 v, Full512 /* tag */, + float* HWY_RESTRICT aligned) { + _mm512_store_ps(aligned, v.raw); +} +HWY_API void Store(const Vec512 v, Full512 /* tag */, + double* HWY_RESTRICT aligned) { + _mm512_store_pd(aligned, v.raw); +} + +template +HWY_API void StoreU(const Vec512 v, Full512 /* tag */, + T* HWY_RESTRICT p) { + _mm512_storeu_si512(reinterpret_cast<__m512i*>(p), v.raw); +} +HWY_API void StoreU(const Vec512 v, Full512 /* tag */, + float* HWY_RESTRICT p) { + _mm512_storeu_ps(p, v.raw); +} +HWY_API void StoreU(const Vec512 v, Full512, + double* HWY_RESTRICT p) { + _mm512_storeu_pd(p, v.raw); +} + +// ------------------------------ BlendedStore + +template +HWY_API void BlendedStore(Vec512 v, Mask512 m, Full512 /* tag */, + T* HWY_RESTRICT p) { + _mm512_mask_storeu_epi8(p, m.raw, v.raw); +} + +template +HWY_API void BlendedStore(Vec512 v, Mask512 m, Full512 /* tag */, + T* HWY_RESTRICT p) { + _mm512_mask_storeu_epi16(p, m.raw, v.raw); +} + +template +HWY_API void BlendedStore(Vec512 v, Mask512 m, Full512 /* tag */, + T* HWY_RESTRICT p) { + _mm512_mask_storeu_epi32(p, m.raw, v.raw); +} + +template +HWY_API void BlendedStore(Vec512 v, Mask512 m, Full512 /* tag */, + T* HWY_RESTRICT p) { + _mm512_mask_storeu_epi64(p, m.raw, v.raw); +} + +HWY_API void BlendedStore(Vec512 v, Mask512 m, + Full512 /* tag */, float* HWY_RESTRICT p) { + _mm512_mask_storeu_ps(p, m.raw, v.raw); +} + +HWY_API void BlendedStore(Vec512 v, Mask512 m, + Full512 /* tag */, double* HWY_RESTRICT p) { + _mm512_mask_storeu_pd(p, m.raw, v.raw); +} + +// ------------------------------ Non-temporal stores + +template +HWY_API void Stream(const Vec512 v, Full512 /* tag */, + T* HWY_RESTRICT aligned) { + _mm512_stream_si512(reinterpret_cast<__m512i*>(aligned), v.raw); +} +HWY_API void Stream(const Vec512 v, Full512 /* tag */, + float* HWY_RESTRICT aligned) { + _mm512_stream_ps(aligned, v.raw); +} +HWY_API void Stream(const Vec512 v, Full512, + double* HWY_RESTRICT aligned) { + _mm512_stream_pd(aligned, v.raw); +} + +// ------------------------------ Scatter + +// Work around warnings in the intrinsic definitions (passing -1 as a mask). 
+HWY_DIAGNOSTICS(push) +HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion") + +namespace detail { + +template +HWY_INLINE void ScatterOffset(hwy::SizeTag<4> /* tag */, Vec512 v, + Full512 /* tag */, T* HWY_RESTRICT base, + const Vec512 offset) { + _mm512_i32scatter_epi32(base, offset.raw, v.raw, 1); +} +template +HWY_INLINE void ScatterIndex(hwy::SizeTag<4> /* tag */, Vec512 v, + Full512 /* tag */, T* HWY_RESTRICT base, + const Vec512 index) { + _mm512_i32scatter_epi32(base, index.raw, v.raw, 4); +} + +template +HWY_INLINE void ScatterOffset(hwy::SizeTag<8> /* tag */, Vec512 v, + Full512 /* tag */, T* HWY_RESTRICT base, + const Vec512 offset) { + _mm512_i64scatter_epi64(base, offset.raw, v.raw, 1); +} +template +HWY_INLINE void ScatterIndex(hwy::SizeTag<8> /* tag */, Vec512 v, + Full512 /* tag */, T* HWY_RESTRICT base, + const Vec512 index) { + _mm512_i64scatter_epi64(base, index.raw, v.raw, 8); +} + +} // namespace detail + +template +HWY_API void ScatterOffset(Vec512 v, Full512 d, T* HWY_RESTRICT base, + const Vec512 offset) { + static_assert(sizeof(T) == sizeof(Offset), "Must match for portability"); + return detail::ScatterOffset(hwy::SizeTag(), v, d, base, offset); +} +template +HWY_API void ScatterIndex(Vec512 v, Full512 d, T* HWY_RESTRICT base, + const Vec512 index) { + static_assert(sizeof(T) == sizeof(Index), "Must match for portability"); + return detail::ScatterIndex(hwy::SizeTag(), v, d, base, index); +} + +HWY_API void ScatterOffset(Vec512 v, Full512 /* tag */, + float* HWY_RESTRICT base, + const Vec512 offset) { + _mm512_i32scatter_ps(base, offset.raw, v.raw, 1); +} +HWY_API void ScatterIndex(Vec512 v, Full512 /* tag */, + float* HWY_RESTRICT base, + const Vec512 index) { + _mm512_i32scatter_ps(base, index.raw, v.raw, 4); +} + +HWY_API void ScatterOffset(Vec512 v, Full512 /* tag */, + double* HWY_RESTRICT base, + const Vec512 offset) { + _mm512_i64scatter_pd(base, offset.raw, v.raw, 1); +} +HWY_API void ScatterIndex(Vec512 v, Full512 /* tag */, + double* HWY_RESTRICT base, + const Vec512 index) { + _mm512_i64scatter_pd(base, index.raw, v.raw, 8); +} + +// ------------------------------ Gather + +namespace detail { + +template +HWY_INLINE Vec512 GatherOffset(hwy::SizeTag<4> /* tag */, + Full512 /* tag */, + const T* HWY_RESTRICT base, + const Vec512 offset) { + return Vec512{_mm512_i32gather_epi32(offset.raw, base, 1)}; +} +template +HWY_INLINE Vec512 GatherIndex(hwy::SizeTag<4> /* tag */, + Full512 /* tag */, + const T* HWY_RESTRICT base, + const Vec512 index) { + return Vec512{_mm512_i32gather_epi32(index.raw, base, 4)}; +} + +template +HWY_INLINE Vec512 GatherOffset(hwy::SizeTag<8> /* tag */, + Full512 /* tag */, + const T* HWY_RESTRICT base, + const Vec512 offset) { + return Vec512{_mm512_i64gather_epi64(offset.raw, base, 1)}; +} +template +HWY_INLINE Vec512 GatherIndex(hwy::SizeTag<8> /* tag */, + Full512 /* tag */, + const T* HWY_RESTRICT base, + const Vec512 index) { + return Vec512{_mm512_i64gather_epi64(index.raw, base, 8)}; +} + +} // namespace detail + +template +HWY_API Vec512 GatherOffset(Full512 d, const T* HWY_RESTRICT base, + const Vec512 offset) { + static_assert(sizeof(T) == sizeof(Offset), "Must match for portability"); + return detail::GatherOffset(hwy::SizeTag(), d, base, offset); +} +template +HWY_API Vec512 GatherIndex(Full512 d, const T* HWY_RESTRICT base, + const Vec512 index) { + static_assert(sizeof(T) == sizeof(Index), "Must match for portability"); + return detail::GatherIndex(hwy::SizeTag(), d, base, index); +} + +HWY_API Vec512 
GatherOffset(Full512 /* tag */, + const float* HWY_RESTRICT base, + const Vec512 offset) { + return Vec512{_mm512_i32gather_ps(offset.raw, base, 1)}; +} +HWY_API Vec512 GatherIndex(Full512 /* tag */, + const float* HWY_RESTRICT base, + const Vec512 index) { + return Vec512{_mm512_i32gather_ps(index.raw, base, 4)}; +} + +HWY_API Vec512 GatherOffset(Full512 /* tag */, + const double* HWY_RESTRICT base, + const Vec512 offset) { + return Vec512{_mm512_i64gather_pd(offset.raw, base, 1)}; +} +HWY_API Vec512 GatherIndex(Full512 /* tag */, + const double* HWY_RESTRICT base, + const Vec512 index) { + return Vec512{_mm512_i64gather_pd(index.raw, base, 8)}; +} + +HWY_DIAGNOSTICS(pop) + +// ================================================== SWIZZLE + +// ------------------------------ LowerHalf + +template +HWY_API Vec256 LowerHalf(Full256 /* tag */, Vec512 v) { + return Vec256{_mm512_castsi512_si256(v.raw)}; +} +HWY_API Vec256 LowerHalf(Full256 /* tag */, Vec512 v) { + return Vec256{_mm512_castps512_ps256(v.raw)}; +} +HWY_API Vec256 LowerHalf(Full256 /* tag */, Vec512 v) { + return Vec256{_mm512_castpd512_pd256(v.raw)}; +} + +template +HWY_API Vec256 LowerHalf(Vec512 v) { + return LowerHalf(Full256(), v); +} + +// ------------------------------ UpperHalf + +template +HWY_API Vec256 UpperHalf(Full256 /* tag */, Vec512 v) { + return Vec256{_mm512_extracti32x8_epi32(v.raw, 1)}; +} +HWY_API Vec256 UpperHalf(Full256 /* tag */, Vec512 v) { + return Vec256{_mm512_extractf32x8_ps(v.raw, 1)}; +} +HWY_API Vec256 UpperHalf(Full256 /* tag */, Vec512 v) { + return Vec256{_mm512_extractf64x4_pd(v.raw, 1)}; +} + +// ------------------------------ ExtractLane (Store) +template +HWY_API T ExtractLane(const Vec512 v, size_t i) { + const Full512 d; + HWY_DASSERT(i < Lanes(d)); + alignas(64) T lanes[64 / sizeof(T)]; + Store(v, d, lanes); + return lanes[i]; +} + +// ------------------------------ InsertLane (Store) +template +HWY_API Vec512 InsertLane(const Vec512 v, size_t i, T t) { + const Full512 d; + HWY_DASSERT(i < Lanes(d)); + alignas(64) T lanes[64 / sizeof(T)]; + Store(v, d, lanes); + lanes[i] = t; + return Load(d, lanes); +} + +// ------------------------------ GetLane (LowerHalf) +template +HWY_API T GetLane(const Vec512 v) { + return GetLane(LowerHalf(v)); +} + +// ------------------------------ ZeroExtendVector + +template +HWY_API Vec512 ZeroExtendVector(Full512 /* tag */, Vec256 lo) { +#if HWY_HAVE_ZEXT // See definition/comment in x86_256-inl.h. 
+ return Vec512{_mm512_zextsi256_si512(lo.raw)}; +#else + return Vec512{_mm512_inserti32x8(_mm512_setzero_si512(), lo.raw, 0)}; +#endif +} +HWY_API Vec512 ZeroExtendVector(Full512 /* tag */, + Vec256 lo) { +#if HWY_HAVE_ZEXT + return Vec512{_mm512_zextps256_ps512(lo.raw)}; +#else + return Vec512{_mm512_insertf32x8(_mm512_setzero_ps(), lo.raw, 0)}; +#endif +} +HWY_API Vec512 ZeroExtendVector(Full512 /* tag */, + Vec256 lo) { +#if HWY_HAVE_ZEXT + return Vec512{_mm512_zextpd256_pd512(lo.raw)}; +#else + return Vec512{_mm512_insertf64x4(_mm512_setzero_pd(), lo.raw, 0)}; +#endif +} + +// ------------------------------ Combine + +template +HWY_API Vec512 Combine(Full512 d, Vec256 hi, Vec256 lo) { + const auto lo512 = ZeroExtendVector(d, lo); + return Vec512{_mm512_inserti32x8(lo512.raw, hi.raw, 1)}; +} +HWY_API Vec512 Combine(Full512 d, Vec256 hi, + Vec256 lo) { + const auto lo512 = ZeroExtendVector(d, lo); + return Vec512{_mm512_insertf32x8(lo512.raw, hi.raw, 1)}; +} +HWY_API Vec512 Combine(Full512 d, Vec256 hi, + Vec256 lo) { + const auto lo512 = ZeroExtendVector(d, lo); + return Vec512{_mm512_insertf64x4(lo512.raw, hi.raw, 1)}; +} + +// ------------------------------ ShiftLeftBytes + +template +HWY_API Vec512 ShiftLeftBytes(Full512 /* tag */, const Vec512 v) { + static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes"); + return Vec512{_mm512_bslli_epi128(v.raw, kBytes)}; +} + +template +HWY_API Vec512 ShiftLeftBytes(const Vec512 v) { + return ShiftLeftBytes(Full512(), v); +} + +// ------------------------------ ShiftLeftLanes + +template +HWY_API Vec512 ShiftLeftLanes(Full512 d, const Vec512 v) { + const Repartition d8; + return BitCast(d, ShiftLeftBytes(BitCast(d8, v))); +} + +template +HWY_API Vec512 ShiftLeftLanes(const Vec512 v) { + return ShiftLeftLanes(Full512(), v); +} + +// ------------------------------ ShiftRightBytes +template +HWY_API Vec512 ShiftRightBytes(Full512 /* tag */, const Vec512 v) { + static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes"); + return Vec512{_mm512_bsrli_epi128(v.raw, kBytes)}; +} + +// ------------------------------ ShiftRightLanes +template +HWY_API Vec512 ShiftRightLanes(Full512 d, const Vec512 v) { + const Repartition d8; + return BitCast(d, ShiftRightBytes(d8, BitCast(d8, v))); +} + +// ------------------------------ CombineShiftRightBytes + +template > +HWY_API V CombineShiftRightBytes(Full512 d, V hi, V lo) { + const Repartition d8; + return BitCast(d, Vec512{_mm512_alignr_epi8( + BitCast(d8, hi).raw, BitCast(d8, lo).raw, kBytes)}); +} + +// ------------------------------ Broadcast/splat any lane + +// Unsigned +template +HWY_API Vec512 Broadcast(const Vec512 v) { + static_assert(0 <= kLane && kLane < 8, "Invalid lane"); + if (kLane < 4) { + const __m512i lo = _mm512_shufflelo_epi16(v.raw, (0x55 * kLane) & 0xFF); + return Vec512{_mm512_unpacklo_epi64(lo, lo)}; + } else { + const __m512i hi = + _mm512_shufflehi_epi16(v.raw, (0x55 * (kLane - 4)) & 0xFF); + return Vec512{_mm512_unpackhi_epi64(hi, hi)}; + } +} +template +HWY_API Vec512 Broadcast(const Vec512 v) { + static_assert(0 <= kLane && kLane < 4, "Invalid lane"); + constexpr _MM_PERM_ENUM perm = static_cast<_MM_PERM_ENUM>(0x55 * kLane); + return Vec512{_mm512_shuffle_epi32(v.raw, perm)}; +} +template +HWY_API Vec512 Broadcast(const Vec512 v) { + static_assert(0 <= kLane && kLane < 2, "Invalid lane"); + constexpr _MM_PERM_ENUM perm = kLane ? 
_MM_PERM_DCDC : _MM_PERM_BABA; + return Vec512{_mm512_shuffle_epi32(v.raw, perm)}; +} + +// Signed +template +HWY_API Vec512 Broadcast(const Vec512 v) { + static_assert(0 <= kLane && kLane < 8, "Invalid lane"); + if (kLane < 4) { + const __m512i lo = _mm512_shufflelo_epi16(v.raw, (0x55 * kLane) & 0xFF); + return Vec512{_mm512_unpacklo_epi64(lo, lo)}; + } else { + const __m512i hi = + _mm512_shufflehi_epi16(v.raw, (0x55 * (kLane - 4)) & 0xFF); + return Vec512{_mm512_unpackhi_epi64(hi, hi)}; + } +} +template +HWY_API Vec512 Broadcast(const Vec512 v) { + static_assert(0 <= kLane && kLane < 4, "Invalid lane"); + constexpr _MM_PERM_ENUM perm = static_cast<_MM_PERM_ENUM>(0x55 * kLane); + return Vec512{_mm512_shuffle_epi32(v.raw, perm)}; +} +template +HWY_API Vec512 Broadcast(const Vec512 v) { + static_assert(0 <= kLane && kLane < 2, "Invalid lane"); + constexpr _MM_PERM_ENUM perm = kLane ? _MM_PERM_DCDC : _MM_PERM_BABA; + return Vec512{_mm512_shuffle_epi32(v.raw, perm)}; +} + +// Float +template +HWY_API Vec512 Broadcast(const Vec512 v) { + static_assert(0 <= kLane && kLane < 4, "Invalid lane"); + constexpr _MM_PERM_ENUM perm = static_cast<_MM_PERM_ENUM>(0x55 * kLane); + return Vec512{_mm512_shuffle_ps(v.raw, v.raw, perm)}; +} +template +HWY_API Vec512 Broadcast(const Vec512 v) { + static_assert(0 <= kLane && kLane < 2, "Invalid lane"); + constexpr _MM_PERM_ENUM perm = static_cast<_MM_PERM_ENUM>(0xFF * kLane); + return Vec512{_mm512_shuffle_pd(v.raw, v.raw, perm)}; +} + +// ------------------------------ Hard-coded shuffles + +// Notation: let Vec512 have lanes 7,6,5,4,3,2,1,0 (0 is +// least-significant). Shuffle0321 rotates four-lane blocks one lane to the +// right (the previous least-significant lane is now most-significant => +// 47650321). These could also be implemented via CombineShiftRightBytes but +// the shuffle_abcd notation is more convenient. + +// Swap 32-bit halves in 64-bit halves. +template +HWY_API Vec512 Shuffle2301(const Vec512 v) { + return Vec512{_mm512_shuffle_epi32(v.raw, _MM_PERM_CDAB)}; +} +HWY_API Vec512 Shuffle2301(const Vec512 v) { + return Vec512{_mm512_shuffle_ps(v.raw, v.raw, _MM_PERM_CDAB)}; +} + +namespace detail { + +template +HWY_API Vec512 Shuffle2301(const Vec512 a, const Vec512 b) { + const Full512 d; + const RebindToFloat df; + return BitCast( + d, Vec512{_mm512_shuffle_ps(BitCast(df, a).raw, BitCast(df, b).raw, + _MM_PERM_CDAB)}); +} +template +HWY_API Vec512 Shuffle1230(const Vec512 a, const Vec512 b) { + const Full512 d; + const RebindToFloat df; + return BitCast( + d, Vec512{_mm512_shuffle_ps(BitCast(df, a).raw, BitCast(df, b).raw, + _MM_PERM_BCDA)}); +} +template +HWY_API Vec512 Shuffle3012(const Vec512 a, const Vec512 b) { + const Full512 d; + const RebindToFloat df; + return BitCast( + d, Vec512{_mm512_shuffle_ps(BitCast(df, a).raw, BitCast(df, b).raw, + _MM_PERM_DABC)}); +} + +} // namespace detail + +// Swap 64-bit halves +HWY_API Vec512 Shuffle1032(const Vec512 v) { + return Vec512{_mm512_shuffle_epi32(v.raw, _MM_PERM_BADC)}; +} +HWY_API Vec512 Shuffle1032(const Vec512 v) { + return Vec512{_mm512_shuffle_epi32(v.raw, _MM_PERM_BADC)}; +} +HWY_API Vec512 Shuffle1032(const Vec512 v) { + // Shorter encoding than _mm512_permute_ps. 
+ return Vec512{_mm512_shuffle_ps(v.raw, v.raw, _MM_PERM_BADC)}; +} +HWY_API Vec512 Shuffle01(const Vec512 v) { + return Vec512{_mm512_shuffle_epi32(v.raw, _MM_PERM_BADC)}; +} +HWY_API Vec512 Shuffle01(const Vec512 v) { + return Vec512{_mm512_shuffle_epi32(v.raw, _MM_PERM_BADC)}; +} +HWY_API Vec512 Shuffle01(const Vec512 v) { + // Shorter encoding than _mm512_permute_pd. + return Vec512{_mm512_shuffle_pd(v.raw, v.raw, _MM_PERM_BBBB)}; +} + +// Rotate right 32 bits +HWY_API Vec512 Shuffle0321(const Vec512 v) { + return Vec512{_mm512_shuffle_epi32(v.raw, _MM_PERM_ADCB)}; +} +HWY_API Vec512 Shuffle0321(const Vec512 v) { + return Vec512{_mm512_shuffle_epi32(v.raw, _MM_PERM_ADCB)}; +} +HWY_API Vec512 Shuffle0321(const Vec512 v) { + return Vec512{_mm512_shuffle_ps(v.raw, v.raw, _MM_PERM_ADCB)}; +} +// Rotate left 32 bits +HWY_API Vec512 Shuffle2103(const Vec512 v) { + return Vec512{_mm512_shuffle_epi32(v.raw, _MM_PERM_CBAD)}; +} +HWY_API Vec512 Shuffle2103(const Vec512 v) { + return Vec512{_mm512_shuffle_epi32(v.raw, _MM_PERM_CBAD)}; +} +HWY_API Vec512 Shuffle2103(const Vec512 v) { + return Vec512{_mm512_shuffle_ps(v.raw, v.raw, _MM_PERM_CBAD)}; +} + +// Reverse +HWY_API Vec512 Shuffle0123(const Vec512 v) { + return Vec512{_mm512_shuffle_epi32(v.raw, _MM_PERM_ABCD)}; +} +HWY_API Vec512 Shuffle0123(const Vec512 v) { + return Vec512{_mm512_shuffle_epi32(v.raw, _MM_PERM_ABCD)}; +} +HWY_API Vec512 Shuffle0123(const Vec512 v) { + return Vec512{_mm512_shuffle_ps(v.raw, v.raw, _MM_PERM_ABCD)}; +} + +// ------------------------------ TableLookupLanes + +// Returned by SetTableIndices/IndicesFromVec for use by TableLookupLanes. +template +struct Indices512 { + __m512i raw; +}; + +template +HWY_API Indices512 IndicesFromVec(Full512 /* tag */, Vec512 vec) { + static_assert(sizeof(T) == sizeof(TI), "Index size must match lane"); +#if HWY_IS_DEBUG_BUILD + const Full512 di; + HWY_DASSERT(AllFalse(di, Lt(vec, Zero(di))) && + AllTrue(di, Lt(vec, Set(di, static_cast(64 / sizeof(T)))))); +#endif + return Indices512{vec.raw}; +} + +template +HWY_API Indices512 SetTableIndices(const Full512 d, const TI* idx) { + const Rebind di; + return IndicesFromVec(d, LoadU(di, idx)); +} + +template +HWY_API Vec512 TableLookupLanes(Vec512 v, Indices512 idx) { + return Vec512{_mm512_permutexvar_epi32(idx.raw, v.raw)}; +} + +template +HWY_API Vec512 TableLookupLanes(Vec512 v, Indices512 idx) { + return Vec512{_mm512_permutexvar_epi64(idx.raw, v.raw)}; +} + +HWY_API Vec512 TableLookupLanes(Vec512 v, Indices512 idx) { + return Vec512{_mm512_permutexvar_ps(idx.raw, v.raw)}; +} + +HWY_API Vec512 TableLookupLanes(Vec512 v, + Indices512 idx) { + return Vec512{_mm512_permutexvar_pd(idx.raw, v.raw)}; +} + +// ------------------------------ Reverse + +template +HWY_API Vec512 Reverse(Full512 d, const Vec512 v) { + const RebindToSigned di; + alignas(64) constexpr int16_t kReverse[32] = { + 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, + 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0}; + const Vec512 idx = Load(di, kReverse); + return BitCast(d, Vec512{ + _mm512_permutexvar_epi16(idx.raw, BitCast(di, v).raw)}); +} + +template +HWY_API Vec512 Reverse(Full512 d, const Vec512 v) { + alignas(64) constexpr int32_t kReverse[16] = {15, 14, 13, 12, 11, 10, 9, 8, + 7, 6, 5, 4, 3, 2, 1, 0}; + return TableLookupLanes(v, SetTableIndices(d, kReverse)); +} + +template +HWY_API Vec512 Reverse(Full512 d, const Vec512 v) { + alignas(64) constexpr int64_t kReverse[8] = {7, 6, 5, 4, 3, 2, 1, 0}; + return TableLookupLanes(v, 
SetTableIndices(d, kReverse)); +} + +// ------------------------------ Reverse2 + +template +HWY_API Vec512 Reverse2(Full512 d, const Vec512 v) { + const Full512 du32; + return BitCast(d, RotateRight<16>(BitCast(du32, v))); +} + +template +HWY_API Vec512 Reverse2(Full512 /* tag */, const Vec512 v) { + return Shuffle2301(v); +} + +template +HWY_API Vec512 Reverse2(Full512 /* tag */, const Vec512 v) { + return Shuffle01(v); +} + +// ------------------------------ Reverse4 + +template +HWY_API Vec512 Reverse4(Full512 d, const Vec512 v) { + const RebindToSigned di; + alignas(64) constexpr int16_t kReverse4[32] = { + 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12, + 19, 18, 17, 16, 23, 22, 21, 20, 27, 26, 25, 24, 31, 30, 29, 28}; + const Vec512 idx = Load(di, kReverse4); + return BitCast(d, Vec512{ + _mm512_permutexvar_epi16(idx.raw, BitCast(di, v).raw)}); +} + +template +HWY_API Vec512 Reverse4(Full512 /* tag */, const Vec512 v) { + return Shuffle0123(v); +} + +template +HWY_API Vec512 Reverse4(Full512 /* tag */, const Vec512 v) { + return Vec512{_mm512_permutex_epi64(v.raw, _MM_SHUFFLE(0, 1, 2, 3))}; +} +HWY_API Vec512 Reverse4(Full512 /* tag */, Vec512 v) { + return Vec512{_mm512_permutex_pd(v.raw, _MM_SHUFFLE(0, 1, 2, 3))}; +} + +// ------------------------------ Reverse8 + +template +HWY_API Vec512 Reverse8(Full512 d, const Vec512 v) { + const RebindToSigned di; + alignas(64) constexpr int16_t kReverse8[32] = { + 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8, + 23, 22, 21, 20, 19, 18, 17, 16, 31, 30, 29, 28, 27, 26, 25, 24}; + const Vec512 idx = Load(di, kReverse8); + return BitCast(d, Vec512{ + _mm512_permutexvar_epi16(idx.raw, BitCast(di, v).raw)}); +} + +template +HWY_API Vec512 Reverse8(Full512 d, const Vec512 v) { + const RebindToSigned di; + alignas(64) constexpr int32_t kReverse8[16] = {7, 6, 5, 4, 3, 2, 1, 0, + 15, 14, 13, 12, 11, 10, 9, 8}; + const Vec512 idx = Load(di, kReverse8); + return BitCast(d, Vec512{ + _mm512_permutexvar_epi32(idx.raw, BitCast(di, v).raw)}); +} + +template +HWY_API Vec512 Reverse8(Full512 d, const Vec512 v) { + return Reverse(d, v); +} + +// ------------------------------ InterleaveLower + +// Interleaves lanes from halves of the 128-bit blocks of "a" (which provides +// the least-significant lane) and "b". To concatenate two half-width integers +// into one, use ZipLower/Upper instead (also works with scalar). 
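+// Example (illustrative only): for u32 lanes, within each 128-bit block with
+// a = {a0,a1,a2,a3} and b = {b0,b1,b2,b3} (lane 0 least-significant),
+// InterleaveLower returns {a0,b0,a1,b1} and InterleaveUpper (below) returns
+// {a2,b2,a3,b3}.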
+ +HWY_API Vec512 InterleaveLower(const Vec512 a, + const Vec512 b) { + return Vec512{_mm512_unpacklo_epi8(a.raw, b.raw)}; +} +HWY_API Vec512 InterleaveLower(const Vec512 a, + const Vec512 b) { + return Vec512{_mm512_unpacklo_epi16(a.raw, b.raw)}; +} +HWY_API Vec512 InterleaveLower(const Vec512 a, + const Vec512 b) { + return Vec512{_mm512_unpacklo_epi32(a.raw, b.raw)}; +} +HWY_API Vec512 InterleaveLower(const Vec512 a, + const Vec512 b) { + return Vec512{_mm512_unpacklo_epi64(a.raw, b.raw)}; +} + +HWY_API Vec512 InterleaveLower(const Vec512 a, + const Vec512 b) { + return Vec512{_mm512_unpacklo_epi8(a.raw, b.raw)}; +} +HWY_API Vec512 InterleaveLower(const Vec512 a, + const Vec512 b) { + return Vec512{_mm512_unpacklo_epi16(a.raw, b.raw)}; +} +HWY_API Vec512 InterleaveLower(const Vec512 a, + const Vec512 b) { + return Vec512{_mm512_unpacklo_epi32(a.raw, b.raw)}; +} +HWY_API Vec512 InterleaveLower(const Vec512 a, + const Vec512 b) { + return Vec512{_mm512_unpacklo_epi64(a.raw, b.raw)}; +} + +HWY_API Vec512 InterleaveLower(const Vec512 a, + const Vec512 b) { + return Vec512{_mm512_unpacklo_ps(a.raw, b.raw)}; +} +HWY_API Vec512 InterleaveLower(const Vec512 a, + const Vec512 b) { + return Vec512{_mm512_unpacklo_pd(a.raw, b.raw)}; +} + +// ------------------------------ InterleaveUpper + +// All functions inside detail lack the required D parameter. +namespace detail { + +HWY_API Vec512 InterleaveUpper(const Vec512 a, + const Vec512 b) { + return Vec512{_mm512_unpackhi_epi8(a.raw, b.raw)}; +} +HWY_API Vec512 InterleaveUpper(const Vec512 a, + const Vec512 b) { + return Vec512{_mm512_unpackhi_epi16(a.raw, b.raw)}; +} +HWY_API Vec512 InterleaveUpper(const Vec512 a, + const Vec512 b) { + return Vec512{_mm512_unpackhi_epi32(a.raw, b.raw)}; +} +HWY_API Vec512 InterleaveUpper(const Vec512 a, + const Vec512 b) { + return Vec512{_mm512_unpackhi_epi64(a.raw, b.raw)}; +} + +HWY_API Vec512 InterleaveUpper(const Vec512 a, + const Vec512 b) { + return Vec512{_mm512_unpackhi_epi8(a.raw, b.raw)}; +} +HWY_API Vec512 InterleaveUpper(const Vec512 a, + const Vec512 b) { + return Vec512{_mm512_unpackhi_epi16(a.raw, b.raw)}; +} +HWY_API Vec512 InterleaveUpper(const Vec512 a, + const Vec512 b) { + return Vec512{_mm512_unpackhi_epi32(a.raw, b.raw)}; +} +HWY_API Vec512 InterleaveUpper(const Vec512 a, + const Vec512 b) { + return Vec512{_mm512_unpackhi_epi64(a.raw, b.raw)}; +} + +HWY_API Vec512 InterleaveUpper(const Vec512 a, + const Vec512 b) { + return Vec512{_mm512_unpackhi_ps(a.raw, b.raw)}; +} +HWY_API Vec512 InterleaveUpper(const Vec512 a, + const Vec512 b) { + return Vec512{_mm512_unpackhi_pd(a.raw, b.raw)}; +} + +} // namespace detail + +template > +HWY_API V InterleaveUpper(Full512 /* tag */, V a, V b) { + return detail::InterleaveUpper(a, b); +} + +// ------------------------------ ZipLower/ZipUpper (InterleaveLower) + +// Same as Interleave*, except that the return lanes are double-width integers; +// this is necessary because the single-lane scalar cannot return two values. 
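+// Example (illustrative only; `v8` is a hypothetical Vec512<uint8_t>):
+//   const Full512<uint16_t> d16;
+//   const auto zext = ZipLower(d16, v8, Zero(Full512<uint8_t>()));
+// zero-extends the u8 lanes in the lower half of each 128-bit block to u16.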
+template > +HWY_API Vec512 ZipLower(Vec512 a, Vec512 b) { + return BitCast(Full512(), InterleaveLower(a, b)); +} +template > +HWY_API Vec512 ZipLower(Full512 /* d */, Vec512 a, Vec512 b) { + return BitCast(Full512(), InterleaveLower(a, b)); +} + +template > +HWY_API Vec512 ZipUpper(Full512 d, Vec512 a, Vec512 b) { + return BitCast(Full512(), InterleaveUpper(d, a, b)); +} + +// ------------------------------ Concat* halves + +// hiH,hiL loH,loL |-> hiL,loL (= lower halves) +template +HWY_API Vec512 ConcatLowerLower(Full512 /* tag */, const Vec512 hi, + const Vec512 lo) { + return Vec512{_mm512_shuffle_i32x4(lo.raw, hi.raw, _MM_PERM_BABA)}; +} +HWY_API Vec512 ConcatLowerLower(Full512 /* tag */, + const Vec512 hi, + const Vec512 lo) { + return Vec512{_mm512_shuffle_f32x4(lo.raw, hi.raw, _MM_PERM_BABA)}; +} +HWY_API Vec512 ConcatLowerLower(Full512 /* tag */, + const Vec512 hi, + const Vec512 lo) { + return Vec512{_mm512_shuffle_f64x2(lo.raw, hi.raw, _MM_PERM_BABA)}; +} + +// hiH,hiL loH,loL |-> hiH,loH (= upper halves) +template +HWY_API Vec512 ConcatUpperUpper(Full512 /* tag */, const Vec512 hi, + const Vec512 lo) { + return Vec512{_mm512_shuffle_i32x4(lo.raw, hi.raw, _MM_PERM_DCDC)}; +} +HWY_API Vec512 ConcatUpperUpper(Full512 /* tag */, + const Vec512 hi, + const Vec512 lo) { + return Vec512{_mm512_shuffle_f32x4(lo.raw, hi.raw, _MM_PERM_DCDC)}; +} +HWY_API Vec512 ConcatUpperUpper(Full512 /* tag */, + const Vec512 hi, + const Vec512 lo) { + return Vec512{_mm512_shuffle_f64x2(lo.raw, hi.raw, _MM_PERM_DCDC)}; +} + +// hiH,hiL loH,loL |-> hiL,loH (= inner halves / swap blocks) +template +HWY_API Vec512 ConcatLowerUpper(Full512 /* tag */, const Vec512 hi, + const Vec512 lo) { + return Vec512{_mm512_shuffle_i32x4(lo.raw, hi.raw, _MM_PERM_BADC)}; +} +HWY_API Vec512 ConcatLowerUpper(Full512 /* tag */, + const Vec512 hi, + const Vec512 lo) { + return Vec512{_mm512_shuffle_f32x4(lo.raw, hi.raw, _MM_PERM_BADC)}; +} +HWY_API Vec512 ConcatLowerUpper(Full512 /* tag */, + const Vec512 hi, + const Vec512 lo) { + return Vec512{_mm512_shuffle_f64x2(lo.raw, hi.raw, _MM_PERM_BADC)}; +} + +// hiH,hiL loH,loL |-> hiH,loL (= outer halves) +template +HWY_API Vec512 ConcatUpperLower(Full512 /* tag */, const Vec512 hi, + const Vec512 lo) { + // There are no imm8 blend in AVX512. Use blend16 because 32-bit masks + // are efficiently loaded from 32-bit regs. 
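+  // (Illustrative note: the blend intrinsics take lanes from their last
+  // operand where the corresponding mask bit is set, so 0x0000FFFF below
+  // selects the lower 16 u16 lanes, i.e. the lower 256 bits, from `lo` and
+  // the remainder from `hi`.)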
+ const __mmask32 mask = /*_cvtu32_mask32 */ (0x0000FFFF); + return Vec512{_mm512_mask_blend_epi16(mask, hi.raw, lo.raw)}; +} +HWY_API Vec512 ConcatUpperLower(Full512 /* tag */, + const Vec512 hi, + const Vec512 lo) { + const __mmask16 mask = /*_cvtu32_mask16 */ (0x00FF); + return Vec512{_mm512_mask_blend_ps(mask, hi.raw, lo.raw)}; +} +HWY_API Vec512 ConcatUpperLower(Full512 /* tag */, + const Vec512 hi, + const Vec512 lo) { + const __mmask8 mask = /*_cvtu32_mask8 */ (0x0F); + return Vec512{_mm512_mask_blend_pd(mask, hi.raw, lo.raw)}; +} + +// ------------------------------ ConcatOdd + +template +HWY_API Vec512 ConcatOdd(Full512 d, Vec512 hi, Vec512 lo) { + const RebindToUnsigned du; +#if HWY_TARGET == HWY_AVX3_DL + alignas(64) constexpr uint8_t kIdx[64] = { + 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, + 27, 29, 31, 33, 35, 37, 39, 41, 43, 45, 47, 49, 51, + 53, 55, 57, 59, 61, 63, 65, 67, 69, 71, 73, 75, 77, + 79, 81, 83, 85, 87, 89, 91, 93, 95, 97, 99, 101, 103, + 105, 107, 109, 111, 113, 115, 117, 119, 121, 123, 125, 127}; + return BitCast(d, + Vec512{_mm512_mask2_permutex2var_epi8( + BitCast(du, lo).raw, Load(du, kIdx).raw, + __mmask64{0xFFFFFFFFFFFFFFFFull}, BitCast(du, hi).raw)}); +#else + const RepartitionToWide dw; + // Right-shift 8 bits per u16 so we can pack. + const Vec512 uH = ShiftRight<8>(BitCast(dw, hi)); + const Vec512 uL = ShiftRight<8>(BitCast(dw, lo)); + const Vec512 u8{_mm512_packus_epi16(uL.raw, uH.raw)}; + // Undo block interleave: lower half = even u64 lanes, upper = odd u64 lanes. + const Full512 du64; + alignas(64) constexpr uint64_t kIdx[8] = {0, 2, 4, 6, 1, 3, 5, 7}; + return BitCast(d, TableLookupLanes(u8, SetTableIndices(du64, kIdx))); +#endif +} + +template +HWY_API Vec512 ConcatOdd(Full512 d, Vec512 hi, Vec512 lo) { + const RebindToUnsigned du; + alignas(64) constexpr uint16_t kIdx[32] = { + 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, + 33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63}; + return BitCast(d, Vec512{_mm512_mask2_permutex2var_epi16( + BitCast(du, lo).raw, Load(du, kIdx).raw, + __mmask32{0xFFFFFFFFu}, BitCast(du, hi).raw)}); +} + +template +HWY_API Vec512 ConcatOdd(Full512 d, Vec512 hi, Vec512 lo) { + const RebindToUnsigned du; + alignas(64) constexpr uint32_t kIdx[16] = {1, 3, 5, 7, 9, 11, 13, 15, + 17, 19, 21, 23, 25, 27, 29, 31}; + return BitCast(d, Vec512{_mm512_mask2_permutex2var_epi32( + BitCast(du, lo).raw, Load(du, kIdx).raw, + __mmask16{0xFFFF}, BitCast(du, hi).raw)}); +} + +HWY_API Vec512 ConcatOdd(Full512 d, Vec512 hi, + Vec512 lo) { + const RebindToUnsigned du; + alignas(64) constexpr uint32_t kIdx[16] = {1, 3, 5, 7, 9, 11, 13, 15, + 17, 19, 21, 23, 25, 27, 29, 31}; + return Vec512{_mm512_mask2_permutex2var_ps(lo.raw, Load(du, kIdx).raw, + __mmask16{0xFFFF}, hi.raw)}; +} + +template +HWY_API Vec512 ConcatOdd(Full512 d, Vec512 hi, Vec512 lo) { + const RebindToUnsigned du; + alignas(64) constexpr uint64_t kIdx[8] = {1, 3, 5, 7, 9, 11, 13, 15}; + return BitCast(d, Vec512{_mm512_mask2_permutex2var_epi64( + BitCast(du, lo).raw, Load(du, kIdx).raw, __mmask8{0xFF}, + BitCast(du, hi).raw)}); +} + +HWY_API Vec512 ConcatOdd(Full512 d, Vec512 hi, + Vec512 lo) { + const RebindToUnsigned du; + alignas(64) constexpr uint64_t kIdx[8] = {1, 3, 5, 7, 9, 11, 13, 15}; + return Vec512{_mm512_mask2_permutex2var_pd(lo.raw, Load(du, kIdx).raw, + __mmask8{0xFF}, hi.raw)}; +} + +// ------------------------------ ConcatEven + +template +HWY_API Vec512 ConcatEven(Full512 d, Vec512 hi, Vec512 lo) { + const RebindToUnsigned du; 
+#if HWY_TARGET == HWY_AVX3_DL + alignas(64) constexpr uint8_t kIdx[64] = { + 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, + 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, + 52, 54, 56, 58, 60, 62, 64, 66, 68, 70, 72, 74, 76, + 78, 80, 82, 84, 86, 88, 90, 92, 94, 96, 98, 100, 102, + 104, 106, 108, 110, 112, 114, 116, 118, 120, 122, 124, 126}; + return BitCast(d, + Vec512{_mm512_mask2_permutex2var_epi8( + BitCast(du, lo).raw, Load(du, kIdx).raw, + __mmask64{0xFFFFFFFFFFFFFFFFull}, BitCast(du, hi).raw)}); +#else + const RepartitionToWide dw; + // Isolate lower 8 bits per u16 so we can pack. + const Vec512 mask = Set(dw, 0x00FF); + const Vec512 uH = And(BitCast(dw, hi), mask); + const Vec512 uL = And(BitCast(dw, lo), mask); + const Vec512 u8{_mm512_packus_epi16(uL.raw, uH.raw)}; + // Undo block interleave: lower half = even u64 lanes, upper = odd u64 lanes. + const Full512 du64; + alignas(64) constexpr uint64_t kIdx[8] = {0, 2, 4, 6, 1, 3, 5, 7}; + return BitCast(d, TableLookupLanes(u8, SetTableIndices(du64, kIdx))); +#endif +} + +template +HWY_API Vec512 ConcatEven(Full512 d, Vec512 hi, Vec512 lo) { + const RebindToUnsigned du; + alignas(64) constexpr uint16_t kIdx[32] = { + 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, + 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62}; + return BitCast(d, Vec512{_mm512_mask2_permutex2var_epi16( + BitCast(du, lo).raw, Load(du, kIdx).raw, + __mmask32{0xFFFFFFFFu}, BitCast(du, hi).raw)}); +} + +template +HWY_API Vec512 ConcatEven(Full512 d, Vec512 hi, Vec512 lo) { + const RebindToUnsigned du; + alignas(64) constexpr uint32_t kIdx[16] = {0, 2, 4, 6, 8, 10, 12, 14, + 16, 18, 20, 22, 24, 26, 28, 30}; + return BitCast(d, Vec512{_mm512_mask2_permutex2var_epi32( + BitCast(du, lo).raw, Load(du, kIdx).raw, + __mmask16{0xFFFF}, BitCast(du, hi).raw)}); +} + +HWY_API Vec512 ConcatEven(Full512 d, Vec512 hi, + Vec512 lo) { + const RebindToUnsigned du; + alignas(64) constexpr uint32_t kIdx[16] = {0, 2, 4, 6, 8, 10, 12, 14, + 16, 18, 20, 22, 24, 26, 28, 30}; + return Vec512{_mm512_mask2_permutex2var_ps(lo.raw, Load(du, kIdx).raw, + __mmask16{0xFFFF}, hi.raw)}; +} + +template +HWY_API Vec512 ConcatEven(Full512 d, Vec512 hi, Vec512 lo) { + const RebindToUnsigned du; + alignas(64) constexpr uint64_t kIdx[8] = {0, 2, 4, 6, 8, 10, 12, 14}; + return BitCast(d, Vec512{_mm512_mask2_permutex2var_epi64( + BitCast(du, lo).raw, Load(du, kIdx).raw, __mmask8{0xFF}, + BitCast(du, hi).raw)}); +} + +HWY_API Vec512 ConcatEven(Full512 d, Vec512 hi, + Vec512 lo) { + const RebindToUnsigned du; + alignas(64) constexpr uint64_t kIdx[8] = {0, 2, 4, 6, 8, 10, 12, 14}; + return Vec512{_mm512_mask2_permutex2var_pd(lo.raw, Load(du, kIdx).raw, + __mmask8{0xFF}, hi.raw)}; +} + +// ------------------------------ DupEven (InterleaveLower) + +template +HWY_API Vec512 DupEven(Vec512 v) { + return Vec512{_mm512_shuffle_epi32(v.raw, _MM_PERM_CCAA)}; +} +HWY_API Vec512 DupEven(Vec512 v) { + return Vec512{_mm512_shuffle_ps(v.raw, v.raw, _MM_PERM_CCAA)}; +} + +template +HWY_API Vec512 DupEven(const Vec512 v) { + return InterleaveLower(Full512(), v, v); +} + +// ------------------------------ DupOdd (InterleaveUpper) + +template +HWY_API Vec512 DupOdd(Vec512 v) { + return Vec512{_mm512_shuffle_epi32(v.raw, _MM_PERM_DDBB)}; +} +HWY_API Vec512 DupOdd(Vec512 v) { + return Vec512{_mm512_shuffle_ps(v.raw, v.raw, _MM_PERM_DDBB)}; +} + +template +HWY_API Vec512 DupOdd(const Vec512 v) { + return InterleaveUpper(Full512(), v, v); +} + +// ------------------------------ OddEven + 
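+// Illustrative note: 0x5555555555555555 has bits set at all even positions.
+// A 512-bit vector has 64/sizeof(T) lanes, so the shift below trims the
+// 64-bit pattern to that many mask bits (sizeof(T)==2 -> 32 bits, ==4 -> 16,
+// ==8 -> 8); the even lanes then come from `b` and the odd lanes from `a`.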
+template <typename T>
+HWY_API Vec512<T> OddEven(const Vec512<T> a, const Vec512<T> b) {
+  constexpr size_t s = sizeof(T);
+  constexpr int shift = s == 1 ? 0 : s == 2 ? 32 : s == 4 ? 48 : 56;
+  return IfThenElse(Mask512<T>{0x5555555555555555ull >> shift}, b, a);
+}
+
+// ------------------------------ OddEvenBlocks
+
+template <typename T>
+HWY_API Vec512<T> OddEvenBlocks(Vec512<T> odd, Vec512<T> even) {
+  return Vec512<T>{
+      _mm512_mask_blend_epi64(__mmask8{0x33u}, odd.raw, even.raw)};
+}
+
+HWY_API Vec512<float> OddEvenBlocks(Vec512<float> odd, Vec512<float> even) {
+  return Vec512<float>{
+      _mm512_mask_blend_ps(__mmask16{0x0F0Fu}, odd.raw, even.raw)};
+}
+
+HWY_API Vec512<double> OddEvenBlocks(Vec512<double> odd, Vec512<double> even) {
+  return Vec512<double>{
+      _mm512_mask_blend_pd(__mmask8{0x33u}, odd.raw, even.raw)};
+}
+
+// ------------------------------ SwapAdjacentBlocks
+
+template <typename T>
+HWY_API Vec512<T> SwapAdjacentBlocks(Vec512<T> v) {
+  return Vec512<T>{_mm512_shuffle_i32x4(v.raw, v.raw, _MM_PERM_CDAB)};
+}
+
+HWY_API Vec512<float> SwapAdjacentBlocks(Vec512<float> v) {
+  return Vec512<float>{_mm512_shuffle_f32x4(v.raw, v.raw, _MM_PERM_CDAB)};
+}
+
+HWY_API Vec512<double> SwapAdjacentBlocks(Vec512<double> v) {
+  return Vec512<double>{_mm512_shuffle_f64x2(v.raw, v.raw, _MM_PERM_CDAB)};
+}
+
+// ------------------------------ ReverseBlocks
+
+template <typename T>
+HWY_API Vec512<T> ReverseBlocks(Full512<T> /* tag */, Vec512<T> v) {
+  return Vec512<T>{_mm512_shuffle_i32x4(v.raw, v.raw, _MM_PERM_ABCD)};
+}
+HWY_API Vec512<float> ReverseBlocks(Full512<float> /* tag */,
+                                    Vec512<float> v) {
+  return Vec512<float>{_mm512_shuffle_f32x4(v.raw, v.raw, _MM_PERM_ABCD)};
+}
+HWY_API Vec512<double> ReverseBlocks(Full512<double> /* tag */,
+                                     Vec512<double> v) {
+  return Vec512<double>{_mm512_shuffle_f64x2(v.raw, v.raw, _MM_PERM_ABCD)};
+}
+
+// ------------------------------ TableLookupBytes (ZeroExtendVector)
+
+// Both full
+template <typename T, typename TI>
+HWY_API Vec512<TI> TableLookupBytes(Vec512<T> bytes, Vec512<TI> indices) {
+  return Vec512<TI>{_mm512_shuffle_epi8(bytes.raw, indices.raw)};
+}
+
+// Partial index vector
+template <typename T, typename TI, size_t NI>
+HWY_API Vec128<TI, NI> TableLookupBytes(Vec512<T> bytes, Vec128<TI, NI> from) {
+  const Full512<TI> d512;
+  const Half<decltype(d512)> d256;
+  const Half<decltype(d256)> d128;
+  // First expand to full 128, then 256, then 512.
+  const Vec128<TI> from_full{from.raw};
+  const auto from_512 =
+      ZeroExtendVector(d512, ZeroExtendVector(d256, from_full));
+  const auto tbl_full = TableLookupBytes(bytes, from_512);
+  // Shrink to 256, then 128, then partial.
+  return Vec128<TI, NI>{LowerHalf(d128, LowerHalf(d256, tbl_full)).raw};
+}
+template <typename T, typename TI>
+HWY_API Vec256<TI> TableLookupBytes(Vec512<T> bytes, Vec256<TI> from) {
+  const auto from_512 = ZeroExtendVector(Full512<TI>(), from);
+  return LowerHalf(Full256<TI>(), TableLookupBytes(bytes, from_512));
+}
+
+// Partial table vector
+template <typename T, size_t N, typename TI>
+HWY_API Vec512<TI> TableLookupBytes(Vec128<T, N> bytes, Vec512<TI> from) {
+  const Full512<TI> d512;
+  const Half<decltype(d512)> d256;
+  const Half<decltype(d256)> d128;
+  // First expand to full 128, then 256, then 512.
+  const Vec128<T> bytes_full{bytes.raw};
+  const auto bytes_512 =
+      ZeroExtendVector(d512, ZeroExtendVector(d256, bytes_full));
+  return TableLookupBytes(bytes_512, from);
+}
+template <typename T, typename TI>
+HWY_API Vec512<TI> TableLookupBytes(Vec256<T> bytes, Vec512<TI> from) {
+  const auto bytes_512 = ZeroExtendVector(Full512<T>(), bytes);
+  return TableLookupBytes(bytes_512, from);
+}
+
+// Partial both are handled by x86_128/256.
+
+// ================================================== CONVERT
+
+// ------------------------------ Promotions (part w/ narrow lanes -> full)
+
+// Unsigned: zero-extend.
+// Note: these have 3 cycle latency; if inputs are already split across the
+// 128-bit blocks (in their upper/lower halves), then Zip* would be faster.
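+// Usage sketch (illustrative only; `v8` is a hypothetical Vec512<uint8_t>):
+//   const Full512<uint16_t> d16;
+//   const Vec512<uint16_t> wide = PromoteTo(d16, LowerHalf(v8));
+// promotes the lower 32 u8 lanes to u16.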
+HWY_API Vec512 PromoteTo(Full512 /* tag */, + Vec256 v) { + return Vec512{_mm512_cvtepu8_epi16(v.raw)}; +} +HWY_API Vec512 PromoteTo(Full512 /* tag */, + Vec128 v) { + return Vec512{_mm512_cvtepu8_epi32(v.raw)}; +} +HWY_API Vec512 PromoteTo(Full512 /* tag */, + Vec256 v) { + return Vec512{_mm512_cvtepu8_epi16(v.raw)}; +} +HWY_API Vec512 PromoteTo(Full512 /* tag */, + Vec128 v) { + return Vec512{_mm512_cvtepu8_epi32(v.raw)}; +} +HWY_API Vec512 PromoteTo(Full512 /* tag */, + Vec256 v) { + return Vec512{_mm512_cvtepu16_epi32(v.raw)}; +} +HWY_API Vec512 PromoteTo(Full512 /* tag */, + Vec256 v) { + return Vec512{_mm512_cvtepu16_epi32(v.raw)}; +} +HWY_API Vec512 PromoteTo(Full512 /* tag */, + Vec256 v) { + return Vec512{_mm512_cvtepu32_epi64(v.raw)}; +} + +// Signed: replicate sign bit. +// Note: these have 3 cycle latency; if inputs are already split across the +// 128 bit blocks (in their upper/lower halves), then ZipUpper/lo followed by +// signed shift would be faster. +HWY_API Vec512 PromoteTo(Full512 /* tag */, + Vec256 v) { + return Vec512{_mm512_cvtepi8_epi16(v.raw)}; +} +HWY_API Vec512 PromoteTo(Full512 /* tag */, + Vec128 v) { + return Vec512{_mm512_cvtepi8_epi32(v.raw)}; +} +HWY_API Vec512 PromoteTo(Full512 /* tag */, + Vec256 v) { + return Vec512{_mm512_cvtepi16_epi32(v.raw)}; +} +HWY_API Vec512 PromoteTo(Full512 /* tag */, + Vec256 v) { + return Vec512{_mm512_cvtepi32_epi64(v.raw)}; +} + +// Float +HWY_API Vec512 PromoteTo(Full512 /* tag */, + const Vec256 v) { + return Vec512{_mm512_cvtph_ps(v.raw)}; +} + +HWY_API Vec512 PromoteTo(Full512 df32, + const Vec256 v) { + const Rebind du16; + const RebindToSigned di32; + return BitCast(df32, ShiftLeft<16>(PromoteTo(di32, BitCast(du16, v)))); +} + +HWY_API Vec512 PromoteTo(Full512 /* tag */, Vec256 v) { + return Vec512{_mm512_cvtps_pd(v.raw)}; +} + +HWY_API Vec512 PromoteTo(Full512 /* tag */, Vec256 v) { + return Vec512{_mm512_cvtepi32_pd(v.raw)}; +} + +// ------------------------------ Demotions (full -> part w/ narrow lanes) + +HWY_API Vec256 DemoteTo(Full256 /* tag */, + const Vec512 v) { + const Vec512 u16{_mm512_packus_epi32(v.raw, v.raw)}; + + // Compress even u64 lanes into 256 bit. + alignas(64) static constexpr uint64_t kLanes[8] = {0, 2, 4, 6, 0, 2, 4, 6}; + const auto idx64 = Load(Full512(), kLanes); + const Vec512 even{_mm512_permutexvar_epi64(idx64.raw, u16.raw)}; + return LowerHalf(even); +} + +HWY_API Vec256 DemoteTo(Full256 /* tag */, + const Vec512 v) { + const Vec512 i16{_mm512_packs_epi32(v.raw, v.raw)}; + + // Compress even u64 lanes into 256 bit. + alignas(64) static constexpr uint64_t kLanes[8] = {0, 2, 4, 6, 0, 2, 4, 6}; + const auto idx64 = Load(Full512(), kLanes); + const Vec512 even{_mm512_permutexvar_epi64(idx64.raw, i16.raw)}; + return LowerHalf(even); +} + +HWY_API Vec128 DemoteTo(Full128 /* tag */, + const Vec512 v) { + const Vec512 u16{_mm512_packus_epi32(v.raw, v.raw)}; + // packus treats the input as signed; we want unsigned. Clear the MSB to get + // unsigned saturation to u8. + const Vec512 i16{ + _mm512_and_si512(u16.raw, _mm512_set1_epi16(0x7FFF))}; + const Vec512 u8{_mm512_packus_epi16(i16.raw, i16.raw)}; + + alignas(16) static constexpr uint32_t kLanes[4] = {0, 4, 8, 12}; + const auto idx32 = LoadDup128(Full512(), kLanes); + const Vec512 fixed{_mm512_permutexvar_epi32(idx32.raw, u8.raw)}; + return LowerHalf(LowerHalf(fixed)); +} + +HWY_API Vec256 DemoteTo(Full256 /* tag */, + const Vec512 v) { + const Vec512 u8{_mm512_packus_epi16(v.raw, v.raw)}; + + // Compress even u64 lanes into 256 bit. 
+ alignas(64) static constexpr uint64_t kLanes[8] = {0, 2, 4, 6, 0, 2, 4, 6}; + const auto idx64 = Load(Full512(), kLanes); + const Vec512 even{_mm512_permutexvar_epi64(idx64.raw, u8.raw)}; + return LowerHalf(even); +} + +HWY_API Vec128 DemoteTo(Full128 /* tag */, + const Vec512 v) { + const Vec512 i16{_mm512_packs_epi32(v.raw, v.raw)}; + const Vec512 i8{_mm512_packs_epi16(i16.raw, i16.raw)}; + + alignas(16) static constexpr uint32_t kLanes[16] = {0, 4, 8, 12, 0, 4, 8, 12, + 0, 4, 8, 12, 0, 4, 8, 12}; + const auto idx32 = LoadDup128(Full512(), kLanes); + const Vec512 fixed{_mm512_permutexvar_epi32(idx32.raw, i8.raw)}; + return LowerHalf(LowerHalf(fixed)); +} + +HWY_API Vec256 DemoteTo(Full256 /* tag */, + const Vec512 v) { + const Vec512 u8{_mm512_packs_epi16(v.raw, v.raw)}; + + // Compress even u64 lanes into 256 bit. + alignas(64) static constexpr uint64_t kLanes[8] = {0, 2, 4, 6, 0, 2, 4, 6}; + const auto idx64 = Load(Full512(), kLanes); + const Vec512 even{_mm512_permutexvar_epi64(idx64.raw, u8.raw)}; + return LowerHalf(even); +} + +HWY_API Vec256 DemoteTo(Full256 /* tag */, + const Vec512 v) { + // Work around warnings in the intrinsic definitions (passing -1 as a mask). + HWY_DIAGNOSTICS(push) + HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion") + return Vec256{_mm512_cvtps_ph(v.raw, _MM_FROUND_NO_EXC)}; + HWY_DIAGNOSTICS(pop) +} + +HWY_API Vec256 DemoteTo(Full256 dbf16, + const Vec512 v) { + // TODO(janwas): _mm512_cvtneps_pbh once we have avx512bf16. + const Rebind di32; + const Rebind du32; // for logical shift right + const Rebind du16; + const auto bits_in_32 = BitCast(di32, ShiftRight<16>(BitCast(du32, v))); + return BitCast(dbf16, DemoteTo(du16, bits_in_32)); +} + +HWY_API Vec512 ReorderDemote2To(Full512 dbf16, + Vec512 a, Vec512 b) { + // TODO(janwas): _mm512_cvtne2ps_pbh once we have avx512bf16. + const RebindToUnsigned du16; + const Repartition du32; + const Vec512 b_in_even = ShiftRight<16>(BitCast(du32, b)); + return BitCast(dbf16, OddEven(BitCast(du16, a), BitCast(du16, b_in_even))); +} + +HWY_API Vec512 ReorderDemote2To(Full512 /*d16*/, + Vec512 a, Vec512 b) { + return Vec512{_mm512_packs_epi32(a.raw, b.raw)}; +} + +HWY_API Vec256 DemoteTo(Full256 /* tag */, + const Vec512 v) { + return Vec256{_mm512_cvtpd_ps(v.raw)}; +} + +HWY_API Vec256 DemoteTo(Full256 /* tag */, + const Vec512 v) { + const auto clamped = detail::ClampF64ToI32Max(Full512(), v); + return Vec256{_mm512_cvttpd_epi32(clamped.raw)}; +} + +// For already range-limited input [0, 255]. +HWY_API Vec128 U8FromU32(const Vec512 v) { + const Full512 d32; + // In each 128 bit block, gather the lower byte of 4 uint32_t lanes into the + // lowest 4 bytes. + alignas(16) static constexpr uint32_t k8From32[4] = {0x0C080400u, ~0u, ~0u, + ~0u}; + const auto quads = TableLookupBytes(v, LoadDup128(d32, k8From32)); + // Gather the lowest 4 bytes of 4 128-bit blocks. 
+ alignas(16) static constexpr uint32_t kIndex32[4] = {0, 4, 8, 12}; + const Vec512 bytes{ + _mm512_permutexvar_epi32(LoadDup128(d32, kIndex32).raw, quads.raw)}; + return LowerHalf(LowerHalf(bytes)); +} + +// ------------------------------ Truncations + +HWY_API Vec128 TruncateTo(Simd d, + const Vec512 v) { +#if HWY_TARGET == HWY_AVX3_DL + (void)d; + const Full512 d8; + alignas(16) static constexpr uint8_t k8From64[16] = { + 0, 8, 16, 24, 32, 40, 48, 56, 0, 8, 16, 24, 32, 40, 48, 56}; + const Vec512 bytes{ + _mm512_permutexvar_epi8(LoadDup128(d8, k8From64).raw, v.raw)}; + return LowerHalf(LowerHalf(LowerHalf(bytes))); +#else + const Full512 d32; + alignas(64) constexpr uint32_t kEven[16] = {0, 2, 4, 6, 8, 10, 12, 14, + 0, 2, 4, 6, 8, 10, 12, 14}; + const Vec512 even{ + _mm512_permutexvar_epi32(Load(d32, kEven).raw, v.raw)}; + return TruncateTo(d, LowerHalf(even)); +#endif +} + +HWY_API Vec128 TruncateTo(Simd /* tag */, + const Vec512 v) { + const Full512 d16; + alignas(16) static constexpr uint16_t k16From64[8] = { + 0, 4, 8, 12, 16, 20, 24, 28}; + const Vec512 bytes{ + _mm512_permutexvar_epi16(LoadDup128(d16, k16From64).raw, v.raw)}; + return LowerHalf(LowerHalf(bytes)); +} + +HWY_API Vec256 TruncateTo(Simd /* tag */, + const Vec512 v) { + const Full512 d32; + alignas(64) constexpr uint32_t kEven[16] = {0, 2, 4, 6, 8, 10, 12, 14, + 0, 2, 4, 6, 8, 10, 12, 14}; + const Vec512 even{ + _mm512_permutexvar_epi32(Load(d32, kEven).raw, v.raw)}; + return LowerHalf(even); +} + +HWY_API Vec128 TruncateTo(Simd /* tag */, + const Vec512 v) { +#if HWY_TARGET == HWY_AVX3_DL + const Full512 d8; + alignas(16) static constexpr uint8_t k8From32[16] = { + 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60}; + const Vec512 bytes{ + _mm512_permutexvar_epi32(LoadDup128(d8, k8From32).raw, v.raw)}; +#else + const Full512 d32; + // In each 128 bit block, gather the lower byte of 4 uint32_t lanes into the + // lowest 4 bytes. + alignas(16) static constexpr uint32_t k8From32[4] = {0x0C080400u, ~0u, ~0u, + ~0u}; + const auto quads = TableLookupBytes(v, LoadDup128(d32, k8From32)); + // Gather the lowest 4 bytes of 4 128-bit blocks. 
+ alignas(16) static constexpr uint32_t kIndex32[4] = {0, 4, 8, 12}; + const Vec512 bytes{ + _mm512_permutexvar_epi32(LoadDup128(d32, kIndex32).raw, quads.raw)}; +#endif + return LowerHalf(LowerHalf(bytes)); +} + +HWY_API Vec256 TruncateTo(Simd /* tag */, + const Vec512 v) { + const Full512 d16; + alignas(64) static constexpr uint16_t k16From32[32] = { + 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, + 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30}; + const Vec512 bytes{ + _mm512_permutexvar_epi16(Load(d16, k16From32).raw, v.raw)}; + return LowerHalf(bytes); +} + +HWY_API Vec256 TruncateTo(Simd /* tag */, + const Vec512 v) { +#if HWY_TARGET == HWY_AVX3_DL + const Full512 d8; + alignas(64) static constexpr uint8_t k8From16[64] = { + 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, + 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, + 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, + 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62}; + const Vec512 bytes{ + _mm512_permutexvar_epi8(Load(d8, k8From16).raw, v.raw)}; +#else + const Full512 d32; + alignas(16) static constexpr uint32_t k16From32[4] = { + 0x06040200u, 0x0E0C0A08u, 0x06040200u, 0x0E0C0A08u}; + const auto quads = TableLookupBytes(v, LoadDup128(d32, k16From32)); + alignas(64) static constexpr uint32_t kIndex32[16] = { + 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 4, 5, 8, 9, 12, 13}; + const Vec512 bytes{ + _mm512_permutexvar_epi32(Load(d32, kIndex32).raw, quads.raw)}; +#endif + return LowerHalf(bytes); +} + +// ------------------------------ Convert integer <=> floating point + +HWY_API Vec512 ConvertTo(Full512 /* tag */, + const Vec512 v) { + return Vec512{_mm512_cvtepi32_ps(v.raw)}; +} + +HWY_API Vec512 ConvertTo(Full512 /* tag */, + const Vec512 v) { + return Vec512{_mm512_cvtepi64_pd(v.raw)}; +} + +HWY_API Vec512 ConvertTo(Full512 /* tag*/, + const Vec512 v) { + return Vec512{_mm512_cvtepu32_ps(v.raw)}; +} + +HWY_API Vec512 ConvertTo(Full512 /* tag*/, + const Vec512 v) { + return Vec512{_mm512_cvtepu64_pd(v.raw)}; +} + +// Truncates (rounds toward zero). +HWY_API Vec512 ConvertTo(Full512 d, const Vec512 v) { + return detail::FixConversionOverflow(d, v, _mm512_cvttps_epi32(v.raw)); +} +HWY_API Vec512 ConvertTo(Full512 di, const Vec512 v) { + return detail::FixConversionOverflow(di, v, _mm512_cvttpd_epi64(v.raw)); +} + +HWY_API Vec512 NearestInt(const Vec512 v) { + const Full512 di; + return detail::FixConversionOverflow(di, v, _mm512_cvtps_epi32(v.raw)); +} + +// ================================================== CRYPTO + +#if !defined(HWY_DISABLE_PCLMUL_AES) + +// Per-target flag to prevent generic_ops-inl.h from defining AESRound. 
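+// (Illustrative note: this appears to be a toggle that is re-evaluated each
+// time the header is re-included per target by foreach_target.h, so the
+// generic fallback is suppressed only for targets compiled with native AES.)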
+#ifdef HWY_NATIVE_AES
+#undef HWY_NATIVE_AES
+#else
+#define HWY_NATIVE_AES
+#endif
+
+HWY_API Vec512<uint8_t> AESRound(Vec512<uint8_t> state,
+                                 Vec512<uint8_t> round_key) {
+#if HWY_TARGET == HWY_AVX3_DL
+  return Vec512<uint8_t>{_mm512_aesenc_epi128(state.raw, round_key.raw)};
+#else
+  const Full512<uint8_t> d;
+  const Half<decltype(d)> d2;
+  return Combine(d, AESRound(UpperHalf(d2, state), UpperHalf(d2, round_key)),
+                 AESRound(LowerHalf(state), LowerHalf(round_key)));
+#endif
+}
+
+HWY_API Vec512<uint8_t> AESLastRound(Vec512<uint8_t> state,
+                                     Vec512<uint8_t> round_key) {
+#if HWY_TARGET == HWY_AVX3_DL
+  return Vec512<uint8_t>{_mm512_aesenclast_epi128(state.raw, round_key.raw)};
+#else
+  const Full512<uint8_t> d;
+  const Half<decltype(d)> d2;
+  return Combine(d,
+                 AESLastRound(UpperHalf(d2, state), UpperHalf(d2, round_key)),
+                 AESLastRound(LowerHalf(state), LowerHalf(round_key)));
+#endif
+}
+
+HWY_API Vec512<uint64_t> CLMulLower(Vec512<uint64_t> va, Vec512<uint64_t> vb) {
+#if HWY_TARGET == HWY_AVX3_DL
+  return Vec512<uint64_t>{_mm512_clmulepi64_epi128(va.raw, vb.raw, 0x00)};
+#else
+  alignas(64) uint64_t a[8];
+  alignas(64) uint64_t b[8];
+  const Full512<uint64_t> d;
+  const Full128<uint64_t> d128;
+  Store(va, d, a);
+  Store(vb, d, b);
+  for (size_t i = 0; i < 8; i += 2) {
+    const auto mul = CLMulLower(Load(d128, a + i), Load(d128, b + i));
+    Store(mul, d128, a + i);
+  }
+  return Load(d, a);
+#endif
+}
+
+HWY_API Vec512<uint64_t> CLMulUpper(Vec512<uint64_t> va, Vec512<uint64_t> vb) {
+#if HWY_TARGET == HWY_AVX3_DL
+  return Vec512<uint64_t>{_mm512_clmulepi64_epi128(va.raw, vb.raw, 0x11)};
+#else
+  alignas(64) uint64_t a[8];
+  alignas(64) uint64_t b[8];
+  const Full512<uint64_t> d;
+  const Full128<uint64_t> d128;
+  Store(va, d, a);
+  Store(vb, d, b);
+  for (size_t i = 0; i < 8; i += 2) {
+    const auto mul = CLMulUpper(Load(d128, a + i), Load(d128, b + i));
+    Store(mul, d128, a + i);
+  }
+  return Load(d, a);
+#endif
+}
+
+#endif  // HWY_DISABLE_PCLMUL_AES
+
+// ================================================== MISC
+
+// Returns a vector with lane i=[0, N) set to "first" + i.
+template <typename T, typename T2>
+Vec512<T> Iota(const Full512<T> d, const T2 first) {
+  HWY_ALIGN T lanes[64 / sizeof(T)];
+  for (size_t i = 0; i < 64 / sizeof(T); ++i) {
+    lanes[i] = static_cast<T>(first + static_cast<T2>(i));
+  }
+  return Load(d, lanes);
+}
+
+// ------------------------------ Mask testing
+
+// Beware: the suffix indicates the number of mask bits, not lane size!
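+// For example, _kortestz_mask16_u8 below operates on 16-bit masks, which
+// correspond to the 16 lanes of 32-bit elements (512 / 32 = 16), not to
+// 16-bit lanes.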
+ +namespace detail { + +template +HWY_INLINE bool AllFalse(hwy::SizeTag<1> /*tag*/, const Mask512 mask) { +#if HWY_COMPILER_HAS_MASK_INTRINSICS + return _kortestz_mask64_u8(mask.raw, mask.raw); +#else + return mask.raw == 0; +#endif +} +template +HWY_INLINE bool AllFalse(hwy::SizeTag<2> /*tag*/, const Mask512 mask) { +#if HWY_COMPILER_HAS_MASK_INTRINSICS + return _kortestz_mask32_u8(mask.raw, mask.raw); +#else + return mask.raw == 0; +#endif +} +template +HWY_INLINE bool AllFalse(hwy::SizeTag<4> /*tag*/, const Mask512 mask) { +#if HWY_COMPILER_HAS_MASK_INTRINSICS + return _kortestz_mask16_u8(mask.raw, mask.raw); +#else + return mask.raw == 0; +#endif +} +template +HWY_INLINE bool AllFalse(hwy::SizeTag<8> /*tag*/, const Mask512 mask) { +#if HWY_COMPILER_HAS_MASK_INTRINSICS + return _kortestz_mask8_u8(mask.raw, mask.raw); +#else + return mask.raw == 0; +#endif +} + +} // namespace detail + +template +HWY_API bool AllFalse(const Full512 /* tag */, const Mask512 mask) { + return detail::AllFalse(hwy::SizeTag(), mask); +} + +namespace detail { + +template +HWY_INLINE bool AllTrue(hwy::SizeTag<1> /*tag*/, const Mask512 mask) { +#if HWY_COMPILER_HAS_MASK_INTRINSICS + return _kortestc_mask64_u8(mask.raw, mask.raw); +#else + return mask.raw == 0xFFFFFFFFFFFFFFFFull; +#endif +} +template +HWY_INLINE bool AllTrue(hwy::SizeTag<2> /*tag*/, const Mask512 mask) { +#if HWY_COMPILER_HAS_MASK_INTRINSICS + return _kortestc_mask32_u8(mask.raw, mask.raw); +#else + return mask.raw == 0xFFFFFFFFull; +#endif +} +template +HWY_INLINE bool AllTrue(hwy::SizeTag<4> /*tag*/, const Mask512 mask) { +#if HWY_COMPILER_HAS_MASK_INTRINSICS + return _kortestc_mask16_u8(mask.raw, mask.raw); +#else + return mask.raw == 0xFFFFull; +#endif +} +template +HWY_INLINE bool AllTrue(hwy::SizeTag<8> /*tag*/, const Mask512 mask) { +#if HWY_COMPILER_HAS_MASK_INTRINSICS + return _kortestc_mask8_u8(mask.raw, mask.raw); +#else + return mask.raw == 0xFFull; +#endif +} + +} // namespace detail + +template +HWY_API bool AllTrue(const Full512 /* tag */, const Mask512 mask) { + return detail::AllTrue(hwy::SizeTag(), mask); +} + +// `p` points to at least 8 readable bytes, not all of which need be valid. +template +HWY_API Mask512 LoadMaskBits(const Full512 /* tag */, + const uint8_t* HWY_RESTRICT bits) { + Mask512 mask; + CopyBytes<8 / sizeof(T)>(bits, &mask.raw); + // N >= 8 (= 512 / 64), so no need to mask invalid bits. + return mask; +} + +// `p` points to at least 8 writable bytes. +template +HWY_API size_t StoreMaskBits(const Full512 /* tag */, const Mask512 mask, + uint8_t* bits) { + const size_t kNumBytes = 8 / sizeof(T); + CopyBytes(&mask.raw, bits); + // N >= 8 (= 512 / 64), so no need to mask invalid bits. + return kNumBytes; +} + +template +HWY_API size_t CountTrue(const Full512 /* tag */, const Mask512 mask) { + return PopCount(static_cast(mask.raw)); +} + +template +HWY_API size_t FindKnownFirstTrue(const Full512 /* tag */, + const Mask512 mask) { + return Num0BitsBelowLS1Bit_Nonzero32(mask.raw); +} + +template +HWY_API size_t FindKnownFirstTrue(const Full512 /* tag */, + const Mask512 mask) { + return Num0BitsBelowLS1Bit_Nonzero64(mask.raw); +} + +template +HWY_API intptr_t FindFirstTrue(const Full512 d, const Mask512 mask) { + return mask.raw ? 
static_cast(FindKnownFirstTrue(d, mask)) + : intptr_t{-1}; +} + +// ------------------------------ Compress + +template +HWY_API Vec512 Compress(Vec512 v, Mask512 mask) { + return Vec512{_mm512_maskz_compress_epi32(mask.raw, v.raw)}; +} + +HWY_API Vec512 Compress(Vec512 v, Mask512 mask) { + return Vec512{_mm512_maskz_compress_ps(mask.raw, v.raw)}; +} + +template +HWY_API Vec512 Compress(Vec512 v, Mask512 mask) { + // See CompressIsPartition. u64 is faster than u32. + alignas(16) constexpr uint64_t packed_array[256] = { + // From PrintCompress32x8Tables, without the FirstN extension (there is + // no benefit to including them because 64-bit CompressStore is anyway + // masked, but also no harm because TableLookupLanes ignores the MSB). + 0x76543210, 0x76543210, 0x76543201, 0x76543210, 0x76543102, 0x76543120, + 0x76543021, 0x76543210, 0x76542103, 0x76542130, 0x76542031, 0x76542310, + 0x76541032, 0x76541320, 0x76540321, 0x76543210, 0x76532104, 0x76532140, + 0x76532041, 0x76532410, 0x76531042, 0x76531420, 0x76530421, 0x76534210, + 0x76521043, 0x76521430, 0x76520431, 0x76524310, 0x76510432, 0x76514320, + 0x76504321, 0x76543210, 0x76432105, 0x76432150, 0x76432051, 0x76432510, + 0x76431052, 0x76431520, 0x76430521, 0x76435210, 0x76421053, 0x76421530, + 0x76420531, 0x76425310, 0x76410532, 0x76415320, 0x76405321, 0x76453210, + 0x76321054, 0x76321540, 0x76320541, 0x76325410, 0x76310542, 0x76315420, + 0x76305421, 0x76354210, 0x76210543, 0x76215430, 0x76205431, 0x76254310, + 0x76105432, 0x76154320, 0x76054321, 0x76543210, 0x75432106, 0x75432160, + 0x75432061, 0x75432610, 0x75431062, 0x75431620, 0x75430621, 0x75436210, + 0x75421063, 0x75421630, 0x75420631, 0x75426310, 0x75410632, 0x75416320, + 0x75406321, 0x75463210, 0x75321064, 0x75321640, 0x75320641, 0x75326410, + 0x75310642, 0x75316420, 0x75306421, 0x75364210, 0x75210643, 0x75216430, + 0x75206431, 0x75264310, 0x75106432, 0x75164320, 0x75064321, 0x75643210, + 0x74321065, 0x74321650, 0x74320651, 0x74326510, 0x74310652, 0x74316520, + 0x74306521, 0x74365210, 0x74210653, 0x74216530, 0x74206531, 0x74265310, + 0x74106532, 0x74165320, 0x74065321, 0x74653210, 0x73210654, 0x73216540, + 0x73206541, 0x73265410, 0x73106542, 0x73165420, 0x73065421, 0x73654210, + 0x72106543, 0x72165430, 0x72065431, 0x72654310, 0x71065432, 0x71654320, + 0x70654321, 0x76543210, 0x65432107, 0x65432170, 0x65432071, 0x65432710, + 0x65431072, 0x65431720, 0x65430721, 0x65437210, 0x65421073, 0x65421730, + 0x65420731, 0x65427310, 0x65410732, 0x65417320, 0x65407321, 0x65473210, + 0x65321074, 0x65321740, 0x65320741, 0x65327410, 0x65310742, 0x65317420, + 0x65307421, 0x65374210, 0x65210743, 0x65217430, 0x65207431, 0x65274310, + 0x65107432, 0x65174320, 0x65074321, 0x65743210, 0x64321075, 0x64321750, + 0x64320751, 0x64327510, 0x64310752, 0x64317520, 0x64307521, 0x64375210, + 0x64210753, 0x64217530, 0x64207531, 0x64275310, 0x64107532, 0x64175320, + 0x64075321, 0x64753210, 0x63210754, 0x63217540, 0x63207541, 0x63275410, + 0x63107542, 0x63175420, 0x63075421, 0x63754210, 0x62107543, 0x62175430, + 0x62075431, 0x62754310, 0x61075432, 0x61754320, 0x60754321, 0x67543210, + 0x54321076, 0x54321760, 0x54320761, 0x54327610, 0x54310762, 0x54317620, + 0x54307621, 0x54376210, 0x54210763, 0x54217630, 0x54207631, 0x54276310, + 0x54107632, 0x54176320, 0x54076321, 0x54763210, 0x53210764, 0x53217640, + 0x53207641, 0x53276410, 0x53107642, 0x53176420, 0x53076421, 0x53764210, + 0x52107643, 0x52176430, 0x52076431, 0x52764310, 0x51076432, 0x51764320, + 0x50764321, 0x57643210, 0x43210765, 0x43217650, 0x43207651, 
0x43276510, + 0x43107652, 0x43176520, 0x43076521, 0x43765210, 0x42107653, 0x42176530, + 0x42076531, 0x42765310, 0x41076532, 0x41765320, 0x40765321, 0x47653210, + 0x32107654, 0x32176540, 0x32076541, 0x32765410, 0x31076542, 0x31765420, + 0x30765421, 0x37654210, 0x21076543, 0x21765430, 0x20765431, 0x27654310, + 0x10765432, 0x17654320, 0x07654321, 0x76543210}; + + // For lane i, shift the i-th 4-bit index down to bits [0, 3) - + // _mm512_permutexvar_epi64 will ignore the upper bits. + const Full512 d; + const RebindToUnsigned du64; + const auto packed = Set(du64, packed_array[mask.raw]); + alignas(64) constexpr uint64_t shifts[8] = {0, 4, 8, 12, 16, 20, 24, 28}; + const auto indices = Indices512{(packed >> Load(du64, shifts)).raw}; + return TableLookupLanes(v, indices); +} + +// 16-bit may use the 32-bit Compress and must be defined after it. +// +// Ignore IDE redefinition error - this is not actually defined in x86_256 if +// we are including x86_512-inl.h. +template +HWY_API Vec256 Compress(Vec256 v, Mask256 mask) { + const Full256 d; + const Rebind du; + const auto vu = BitCast(du, v); // (required for float16_t inputs) + +#if HWY_TARGET == HWY_AVX3_DL // VBMI2 + const Vec256 cu{_mm256_maskz_compress_epi16(mask.raw, vu.raw)}; +#else + // Promote to i32 (512-bit vector!) so we can use the native Compress. + const auto vw = PromoteTo(Rebind(), vu); + const Mask512 mask32{static_cast<__mmask16>(mask.raw)}; + const auto cu = DemoteTo(du, Compress(vw, mask32)); +#endif // HWY_TARGET == HWY_AVX3_DL + + return BitCast(d, cu); +} + +// Expands to 32-bit, compresses, concatenate demoted halves. +template +HWY_API Vec512 Compress(Vec512 v, const Mask512 mask) { + const Full512 d; + const Rebind du; + const auto vu = BitCast(du, v); // (required for float16_t inputs) + +#if HWY_TARGET == HWY_AVX3_DL // VBMI2 + const Vec512 cu{_mm512_maskz_compress_epi16(mask.raw, vu.raw)}; +#else + const Repartition dw; + const Half duh; + const auto promoted0 = PromoteTo(dw, LowerHalf(duh, vu)); + const auto promoted1 = PromoteTo(dw, UpperHalf(duh, vu)); + + const uint32_t mask_bits{mask.raw}; + const Mask512 mask0{static_cast<__mmask16>(mask_bits & 0xFFFF)}; + const Mask512 mask1{static_cast<__mmask16>(mask_bits >> 16)}; + const auto compressed0 = Compress(promoted0, mask0); + const auto compressed1 = Compress(promoted1, mask1); + + const auto demoted0 = ZeroExtendVector(du, DemoteTo(duh, compressed0)); + const auto demoted1 = ZeroExtendVector(du, DemoteTo(duh, compressed1)); + + // Concatenate into single vector by shifting upper with writemask. + const size_t num0 = CountTrue(dw, mask0); + const __mmask32 m_upper = ~((1u << num0) - 1); + alignas(64) uint16_t iota[64] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}; + const Vec512 idx = LoadU(du, iota + 32 - num0); + const Vec512 cu{_mm512_mask_permutexvar_epi16( + demoted0.raw, m_upper, idx.raw, demoted1.raw)}; +#endif // HWY_TARGET == HWY_AVX3_DL + + return BitCast(d, cu); +} + +// ------------------------------ CompressNot + +template +HWY_API Vec512 CompressNot(Vec512 v, const Mask512 mask) { + return Compress(v, Not(mask)); +} + +template +HWY_API Vec512 CompressNot(Vec512 v, Mask512 mask) { + // See CompressIsPartition. u64 is faster than u32. 
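+  // Illustrative decode of the table scheme shared with Compress above
+  // (worked example, not original code): for u64 lanes and mask.raw =
+  // 0b00000110, Compress's packed_array[6] is 0x76543021, i.e. nibbles
+  // {1,2,0,3,4,5,6,7} from the LSB. Lane i of the result is the input lane
+  // named by the i-th nibble, so the selected lanes 1 and 2 move to the
+  // front and the rest keep their relative order - the partition that
+  // CompressIsPartition promises.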
+ alignas(16) constexpr uint64_t packed_array[256] = { + // From PrintCompressNot32x8Tables, without the FirstN extension (there is + // no benefit to including them because 64-bit CompressStore is anyway + // masked, but also no harm because TableLookupLanes ignores the MSB). + 0x76543210, 0x07654321, 0x17654320, 0x10765432, 0x27654310, 0x20765431, + 0x21765430, 0x21076543, 0x37654210, 0x30765421, 0x31765420, 0x31076542, + 0x32765410, 0x32076541, 0x32176540, 0x32107654, 0x47653210, 0x40765321, + 0x41765320, 0x41076532, 0x42765310, 0x42076531, 0x42176530, 0x42107653, + 0x43765210, 0x43076521, 0x43176520, 0x43107652, 0x43276510, 0x43207651, + 0x43217650, 0x43210765, 0x57643210, 0x50764321, 0x51764320, 0x51076432, + 0x52764310, 0x52076431, 0x52176430, 0x52107643, 0x53764210, 0x53076421, + 0x53176420, 0x53107642, 0x53276410, 0x53207641, 0x53217640, 0x53210764, + 0x54763210, 0x54076321, 0x54176320, 0x54107632, 0x54276310, 0x54207631, + 0x54217630, 0x54210763, 0x54376210, 0x54307621, 0x54317620, 0x54310762, + 0x54327610, 0x54320761, 0x54321760, 0x54321076, 0x67543210, 0x60754321, + 0x61754320, 0x61075432, 0x62754310, 0x62075431, 0x62175430, 0x62107543, + 0x63754210, 0x63075421, 0x63175420, 0x63107542, 0x63275410, 0x63207541, + 0x63217540, 0x63210754, 0x64753210, 0x64075321, 0x64175320, 0x64107532, + 0x64275310, 0x64207531, 0x64217530, 0x64210753, 0x64375210, 0x64307521, + 0x64317520, 0x64310752, 0x64327510, 0x64320751, 0x64321750, 0x64321075, + 0x65743210, 0x65074321, 0x65174320, 0x65107432, 0x65274310, 0x65207431, + 0x65217430, 0x65210743, 0x65374210, 0x65307421, 0x65317420, 0x65310742, + 0x65327410, 0x65320741, 0x65321740, 0x65321074, 0x65473210, 0x65407321, + 0x65417320, 0x65410732, 0x65427310, 0x65420731, 0x65421730, 0x65421073, + 0x65437210, 0x65430721, 0x65431720, 0x65431072, 0x65432710, 0x65432071, + 0x65432170, 0x65432107, 0x76543210, 0x70654321, 0x71654320, 0x71065432, + 0x72654310, 0x72065431, 0x72165430, 0x72106543, 0x73654210, 0x73065421, + 0x73165420, 0x73106542, 0x73265410, 0x73206541, 0x73216540, 0x73210654, + 0x74653210, 0x74065321, 0x74165320, 0x74106532, 0x74265310, 0x74206531, + 0x74216530, 0x74210653, 0x74365210, 0x74306521, 0x74316520, 0x74310652, + 0x74326510, 0x74320651, 0x74321650, 0x74321065, 0x75643210, 0x75064321, + 0x75164320, 0x75106432, 0x75264310, 0x75206431, 0x75216430, 0x75210643, + 0x75364210, 0x75306421, 0x75316420, 0x75310642, 0x75326410, 0x75320641, + 0x75321640, 0x75321064, 0x75463210, 0x75406321, 0x75416320, 0x75410632, + 0x75426310, 0x75420631, 0x75421630, 0x75421063, 0x75436210, 0x75430621, + 0x75431620, 0x75431062, 0x75432610, 0x75432061, 0x75432160, 0x75432106, + 0x76543210, 0x76054321, 0x76154320, 0x76105432, 0x76254310, 0x76205431, + 0x76215430, 0x76210543, 0x76354210, 0x76305421, 0x76315420, 0x76310542, + 0x76325410, 0x76320541, 0x76321540, 0x76321054, 0x76453210, 0x76405321, + 0x76415320, 0x76410532, 0x76425310, 0x76420531, 0x76421530, 0x76421053, + 0x76435210, 0x76430521, 0x76431520, 0x76431052, 0x76432510, 0x76432051, + 0x76432150, 0x76432105, 0x76543210, 0x76504321, 0x76514320, 0x76510432, + 0x76524310, 0x76520431, 0x76521430, 0x76521043, 0x76534210, 0x76530421, + 0x76531420, 0x76531042, 0x76532410, 0x76532041, 0x76532140, 0x76532104, + 0x76543210, 0x76540321, 0x76541320, 0x76541032, 0x76542310, 0x76542031, + 0x76542130, 0x76542103, 0x76543210, 0x76543021, 0x76543120, 0x76543102, + 0x76543210, 0x76543201, 0x76543210, 0x76543210}; + + // For lane i, shift the i-th 4-bit index down to bits [0, 3) - + // _mm512_permutexvar_epi64 will ignore the 
upper bits. + const Full512 d; + const RebindToUnsigned du64; + const auto packed = Set(du64, packed_array[mask.raw]); + alignas(64) constexpr uint64_t shifts[8] = {0, 4, 8, 12, 16, 20, 24, 28}; + const auto indices = Indices512{(packed >> Load(du64, shifts)).raw}; + return TableLookupLanes(v, indices); +} + +HWY_API Vec512 CompressBlocksNot(Vec512 v, + Mask512 mask) { + return CompressNot(v, mask); +} + +// ------------------------------ CompressBits +template +HWY_API Vec512 CompressBits(Vec512 v, const uint8_t* HWY_RESTRICT bits) { + return Compress(v, LoadMaskBits(Full512(), bits)); +} + +// ------------------------------ CompressStore + +template +HWY_API size_t CompressStore(Vec512 v, Mask512 mask, Full512 d, + T* HWY_RESTRICT unaligned) { + const Rebind du; + const auto vu = BitCast(du, v); // (required for float16_t inputs) + + const uint64_t mask_bits{mask.raw}; + +#if HWY_TARGET == HWY_AVX3_DL // VBMI2 + _mm512_mask_compressstoreu_epi16(unaligned, mask.raw, vu.raw); +#else + const Repartition dw; + const Half duh; + const auto promoted0 = PromoteTo(dw, LowerHalf(duh, vu)); + const auto promoted1 = PromoteTo(dw, UpperHalf(duh, vu)); + + const uint64_t maskL = mask_bits & 0xFFFF; + const uint64_t maskH = mask_bits >> 16; + const Mask512 mask0{static_cast<__mmask16>(maskL)}; + const Mask512 mask1{static_cast<__mmask16>(maskH)}; + const auto compressed0 = Compress(promoted0, mask0); + const auto compressed1 = Compress(promoted1, mask1); + + const Half dh; + const auto demoted0 = BitCast(dh, DemoteTo(duh, compressed0)); + const auto demoted1 = BitCast(dh, DemoteTo(duh, compressed1)); + + // Store 256-bit halves + StoreU(demoted0, dh, unaligned); + StoreU(demoted1, dh, unaligned + PopCount(maskL)); +#endif + + return PopCount(mask_bits); +} + +template +HWY_API size_t CompressStore(Vec512 v, Mask512 mask, Full512 /* tag */, + T* HWY_RESTRICT unaligned) { + _mm512_mask_compressstoreu_epi32(unaligned, mask.raw, v.raw); + const size_t count = PopCount(uint64_t{mask.raw}); +// Workaround for MSAN not marking output as initialized (b/233326619) +#if HWY_IS_MSAN + __msan_unpoison(unaligned, count * sizeof(T)); +#endif + return count; +} + +template +HWY_API size_t CompressStore(Vec512 v, Mask512 mask, Full512 /* tag */, + T* HWY_RESTRICT unaligned) { + _mm512_mask_compressstoreu_epi64(unaligned, mask.raw, v.raw); + const size_t count = PopCount(uint64_t{mask.raw}); +// Workaround for MSAN not marking output as initialized (b/233326619) +#if HWY_IS_MSAN + __msan_unpoison(unaligned, count * sizeof(T)); +#endif + return count; +} + +HWY_API size_t CompressStore(Vec512 v, Mask512 mask, + Full512 /* tag */, + float* HWY_RESTRICT unaligned) { + _mm512_mask_compressstoreu_ps(unaligned, mask.raw, v.raw); + const size_t count = PopCount(uint64_t{mask.raw}); +// Workaround for MSAN not marking output as initialized (b/233326619) +#if HWY_IS_MSAN + __msan_unpoison(unaligned, count * sizeof(float)); +#endif + return count; +} + +HWY_API size_t CompressStore(Vec512 v, Mask512 mask, + Full512 /* tag */, + double* HWY_RESTRICT unaligned) { + _mm512_mask_compressstoreu_pd(unaligned, mask.raw, v.raw); + const size_t count = PopCount(uint64_t{mask.raw}); +// Workaround for MSAN not marking output as initialized (b/233326619) +#if HWY_IS_MSAN + __msan_unpoison(unaligned, count * sizeof(double)); +#endif + return count; +} + +// ------------------------------ CompressBlendedStore +template +HWY_API size_t CompressBlendedStore(Vec512 v, Mask512 m, Full512 d, + T* HWY_RESTRICT unaligned) { + // AVX-512 already 
does the blending at no extra cost (latency 11, + // rthroughput 2 - same as compress plus store). + if (HWY_TARGET == HWY_AVX3_DL || sizeof(T) != 2) { + return CompressStore(v, m, d, unaligned); + } else { + const size_t count = CountTrue(d, m); + BlendedStore(Compress(v, m), FirstN(d, count), d, unaligned); +// Workaround for MSAN not marking output as initialized (b/233326619) +#if HWY_IS_MSAN + __msan_unpoison(unaligned, count * sizeof(T)); +#endif + return count; + } +} + +// ------------------------------ CompressBitsStore +template +HWY_API size_t CompressBitsStore(Vec512 v, const uint8_t* HWY_RESTRICT bits, + Full512 d, T* HWY_RESTRICT unaligned) { + return CompressStore(v, LoadMaskBits(d, bits), d, unaligned); +} + +// ------------------------------ LoadInterleaved4 + +// Actually implemented in generic_ops, we just overload LoadTransposedBlocks4. +namespace detail { + +// Type-safe wrapper. +template <_MM_PERM_ENUM kPerm, typename T> +Vec512 Shuffle128(const Vec512 lo, const Vec512 hi) { + return Vec512{_mm512_shuffle_i64x2(lo.raw, hi.raw, kPerm)}; +} +template <_MM_PERM_ENUM kPerm> +Vec512 Shuffle128(const Vec512 lo, const Vec512 hi) { + return Vec512{_mm512_shuffle_f32x4(lo.raw, hi.raw, kPerm)}; +} +template <_MM_PERM_ENUM kPerm> +Vec512 Shuffle128(const Vec512 lo, const Vec512 hi) { + return Vec512{_mm512_shuffle_f64x2(lo.raw, hi.raw, kPerm)}; +} + +// Input (128-bit blocks): +// 3 2 1 0 (<- first block in unaligned) +// 7 6 5 4 +// b a 9 8 +// Output: +// 9 6 3 0 (LSB of A) +// a 7 4 1 +// b 8 5 2 +template +HWY_API void LoadTransposedBlocks3(Full512 d, + const T* HWY_RESTRICT unaligned, + Vec512& A, Vec512& B, Vec512& C) { + constexpr size_t N = 64 / sizeof(T); + const Vec512 v3210 = LoadU(d, unaligned + 0 * N); + const Vec512 v7654 = LoadU(d, unaligned + 1 * N); + const Vec512 vba98 = LoadU(d, unaligned + 2 * N); + + const Vec512 v5421 = detail::Shuffle128<_MM_PERM_BACB>(v3210, v7654); + const Vec512 va976 = detail::Shuffle128<_MM_PERM_CBDC>(v7654, vba98); + + A = detail::Shuffle128<_MM_PERM_CADA>(v3210, va976); + B = detail::Shuffle128<_MM_PERM_DBCA>(v5421, va976); + C = detail::Shuffle128<_MM_PERM_DADB>(v5421, vba98); +} + +// Input (128-bit blocks): +// 3 2 1 0 (<- first block in unaligned) +// 7 6 5 4 +// b a 9 8 +// f e d c +// Output: +// c 8 4 0 (LSB of A) +// d 9 5 1 +// e a 6 2 +// f b 7 3 +template +HWY_API void LoadTransposedBlocks4(Full512 d, + const T* HWY_RESTRICT unaligned, + Vec512& A, Vec512& B, Vec512& C, + Vec512& D) { + constexpr size_t N = 64 / sizeof(T); + const Vec512 v3210 = LoadU(d, unaligned + 0 * N); + const Vec512 v7654 = LoadU(d, unaligned + 1 * N); + const Vec512 vba98 = LoadU(d, unaligned + 2 * N); + const Vec512 vfedc = LoadU(d, unaligned + 3 * N); + + const Vec512 v5410 = detail::Shuffle128<_MM_PERM_BABA>(v3210, v7654); + const Vec512 vdc98 = detail::Shuffle128<_MM_PERM_BABA>(vba98, vfedc); + const Vec512 v7632 = detail::Shuffle128<_MM_PERM_DCDC>(v3210, v7654); + const Vec512 vfeba = detail::Shuffle128<_MM_PERM_DCDC>(vba98, vfedc); + A = detail::Shuffle128<_MM_PERM_CACA>(v5410, vdc98); + B = detail::Shuffle128<_MM_PERM_DBDB>(v5410, vdc98); + C = detail::Shuffle128<_MM_PERM_CACA>(v7632, vfeba); + D = detail::Shuffle128<_MM_PERM_DBDB>(v7632, vfeba); +} + +} // namespace detail + +// ------------------------------ StoreInterleaved2 + +// Implemented in generic_ops, we just overload StoreTransposedBlocks2/3/4. 
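+// A hedged caller-side sketch (the pixel buffer is assumed, names per
+// generic_ops): LoadInterleaved4 ends up in the LoadTransposedBlocks4
+// overload above, e.g.
+//   const Full512<uint8_t> d;
+//   Vec512<uint8_t> r, g, b, a;
+//   LoadInterleaved4(d, rgba, r, g, b, a);  // de-interleaves 64 RGBA pixels
+// Eight _mm512_shuffle_i64x2 ops implement the 4x4 block transpose, each one
+// selecting two 128-bit blocks from each of its two sources.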
+ +namespace detail { + +// Input (128-bit blocks): +// 6 4 2 0 (LSB of i) +// 7 5 3 1 +// Output: +// 3 2 1 0 +// 7 6 5 4 +template +HWY_API void StoreTransposedBlocks2(const Vec512 i, const Vec512 j, + const Full512 d, + T* HWY_RESTRICT unaligned) { + constexpr size_t N = 64 / sizeof(T); + const auto j1_j0_i1_i0 = detail::Shuffle128<_MM_PERM_BABA>(i, j); + const auto j3_j2_i3_i2 = detail::Shuffle128<_MM_PERM_DCDC>(i, j); + const auto j1_i1_j0_i0 = + detail::Shuffle128<_MM_PERM_DBCA>(j1_j0_i1_i0, j1_j0_i1_i0); + const auto j3_i3_j2_i2 = + detail::Shuffle128<_MM_PERM_DBCA>(j3_j2_i3_i2, j3_j2_i3_i2); + StoreU(j1_i1_j0_i0, d, unaligned + 0 * N); + StoreU(j3_i3_j2_i2, d, unaligned + 1 * N); +} + +// Input (128-bit blocks): +// 9 6 3 0 (LSB of i) +// a 7 4 1 +// b 8 5 2 +// Output: +// 3 2 1 0 +// 7 6 5 4 +// b a 9 8 +template +HWY_API void StoreTransposedBlocks3(const Vec512 i, const Vec512 j, + const Vec512 k, Full512 d, + T* HWY_RESTRICT unaligned) { + constexpr size_t N = 64 / sizeof(T); + const Vec512 j2_j0_i2_i0 = detail::Shuffle128<_MM_PERM_CACA>(i, j); + const Vec512 i3_i1_k2_k0 = detail::Shuffle128<_MM_PERM_DBCA>(k, i); + const Vec512 j3_j1_k3_k1 = detail::Shuffle128<_MM_PERM_DBDB>(k, j); + + const Vec512 out0 = // i1 k0 j0 i0 + detail::Shuffle128<_MM_PERM_CACA>(j2_j0_i2_i0, i3_i1_k2_k0); + const Vec512 out1 = // j2 i2 k1 j1 + detail::Shuffle128<_MM_PERM_DBAC>(j3_j1_k3_k1, j2_j0_i2_i0); + const Vec512 out2 = // k3 j3 i3 k2 + detail::Shuffle128<_MM_PERM_BDDB>(i3_i1_k2_k0, j3_j1_k3_k1); + + StoreU(out0, d, unaligned + 0 * N); + StoreU(out1, d, unaligned + 1 * N); + StoreU(out2, d, unaligned + 2 * N); +} + +// Input (128-bit blocks): +// c 8 4 0 (LSB of i) +// d 9 5 1 +// e a 6 2 +// f b 7 3 +// Output: +// 3 2 1 0 +// 7 6 5 4 +// b a 9 8 +// f e d c +template +HWY_API void StoreTransposedBlocks4(const Vec512 i, const Vec512 j, + const Vec512 k, const Vec512 l, + Full512 d, T* HWY_RESTRICT unaligned) { + constexpr size_t N = 64 / sizeof(T); + const Vec512 j1_j0_i1_i0 = detail::Shuffle128<_MM_PERM_BABA>(i, j); + const Vec512 l1_l0_k1_k0 = detail::Shuffle128<_MM_PERM_BABA>(k, l); + const Vec512 j3_j2_i3_i2 = detail::Shuffle128<_MM_PERM_DCDC>(i, j); + const Vec512 l3_l2_k3_k2 = detail::Shuffle128<_MM_PERM_DCDC>(k, l); + const Vec512 out0 = + detail::Shuffle128<_MM_PERM_CACA>(j1_j0_i1_i0, l1_l0_k1_k0); + const Vec512 out1 = + detail::Shuffle128<_MM_PERM_DBDB>(j1_j0_i1_i0, l1_l0_k1_k0); + const Vec512 out2 = + detail::Shuffle128<_MM_PERM_CACA>(j3_j2_i3_i2, l3_l2_k3_k2); + const Vec512 out3 = + detail::Shuffle128<_MM_PERM_DBDB>(j3_j2_i3_i2, l3_l2_k3_k2); + StoreU(out0, d, unaligned + 0 * N); + StoreU(out1, d, unaligned + 1 * N); + StoreU(out2, d, unaligned + 2 * N); + StoreU(out3, d, unaligned + 3 * N); +} + +} // namespace detail + +// ------------------------------ MulEven/Odd (Shuffle2301, InterleaveLower) + +HWY_INLINE Vec512 MulEven(const Vec512 a, + const Vec512 b) { + const DFromV du64; + const RepartitionToNarrow du32; + const auto maskL = Set(du64, 0xFFFFFFFFULL); + const auto a32 = BitCast(du32, a); + const auto b32 = BitCast(du32, b); + // Inputs for MulEven: we only need the lower 32 bits + const auto aH = Shuffle2301(a32); + const auto bH = Shuffle2301(b32); + + // Knuth double-word multiplication. We use 32x32 = 64 MulEven and only need + // the even (lower 64 bits of every 128-bit block) results. 
See + // https://github.com/hcs0/Hackers-Delight/blob/master/muldwu.c.tat + const auto aLbL = MulEven(a32, b32); + const auto w3 = aLbL & maskL; + + const auto t2 = MulEven(aH, b32) + ShiftRight<32>(aLbL); + const auto w2 = t2 & maskL; + const auto w1 = ShiftRight<32>(t2); + + const auto t = MulEven(a32, bH) + w2; + const auto k = ShiftRight<32>(t); + + const auto mulH = MulEven(aH, bH) + w1 + k; + const auto mulL = ShiftLeft<32>(t) + w3; + return InterleaveLower(mulL, mulH); +} + +HWY_INLINE Vec512 MulOdd(const Vec512 a, + const Vec512 b) { + const DFromV du64; + const RepartitionToNarrow du32; + const auto maskL = Set(du64, 0xFFFFFFFFULL); + const auto a32 = BitCast(du32, a); + const auto b32 = BitCast(du32, b); + // Inputs for MulEven: we only need bits [95:64] (= upper half of input) + const auto aH = Shuffle2301(a32); + const auto bH = Shuffle2301(b32); + + // Same as above, but we're using the odd results (upper 64 bits per block). + const auto aLbL = MulEven(a32, b32); + const auto w3 = aLbL & maskL; + + const auto t2 = MulEven(aH, b32) + ShiftRight<32>(aLbL); + const auto w2 = t2 & maskL; + const auto w1 = ShiftRight<32>(t2); + + const auto t = MulEven(a32, bH) + w2; + const auto k = ShiftRight<32>(t); + + const auto mulH = MulEven(aH, bH) + w1 + k; + const auto mulL = ShiftLeft<32>(t) + w3; + return InterleaveUpper(du64, mulL, mulH); +} + +// ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower) + +HWY_API Vec512 ReorderWidenMulAccumulate(Full512 df32, + Vec512 a, + Vec512 b, + const Vec512 sum0, + Vec512& sum1) { + // TODO(janwas): _mm512_dpbf16_ps when available + const Repartition du16; + const RebindToUnsigned du32; + const Vec512 zero = Zero(du16); + // Lane order within sum0/1 is undefined, hence we can avoid the + // longer-latency lane-crossing PromoteTo. + const Vec512 a0 = ZipLower(du32, zero, BitCast(du16, a)); + const Vec512 a1 = ZipUpper(du32, zero, BitCast(du16, a)); + const Vec512 b0 = ZipLower(du32, zero, BitCast(du16, b)); + const Vec512 b1 = ZipUpper(du32, zero, BitCast(du16, b)); + sum1 = MulAdd(BitCast(df32, a1), BitCast(df32, b1), sum1); + return MulAdd(BitCast(df32, a0), BitCast(df32, b0), sum0); +} + +HWY_API Vec512 ReorderWidenMulAccumulate(Full512 /*d32*/, + Vec512 a, + Vec512 b, + const Vec512 sum0, + Vec512& /*sum1*/) { + return sum0 + Vec512{_mm512_madd_epi16(a.raw, b.raw)}; +} + +// ------------------------------ Reductions + +// Returns the sum in each lane. +HWY_API Vec512 SumOfLanes(Full512 d, Vec512 v) { + return Set(d, _mm512_reduce_add_epi32(v.raw)); +} +HWY_API Vec512 SumOfLanes(Full512 d, Vec512 v) { + return Set(d, _mm512_reduce_add_epi64(v.raw)); +} +HWY_API Vec512 SumOfLanes(Full512 d, Vec512 v) { + return Set(d, static_cast(_mm512_reduce_add_epi32(v.raw))); +} +HWY_API Vec512 SumOfLanes(Full512 d, Vec512 v) { + return Set(d, static_cast(_mm512_reduce_add_epi64(v.raw))); +} +HWY_API Vec512 SumOfLanes(Full512 d, Vec512 v) { + return Set(d, _mm512_reduce_add_ps(v.raw)); +} +HWY_API Vec512 SumOfLanes(Full512 d, Vec512 v) { + return Set(d, _mm512_reduce_add_pd(v.raw)); +} +HWY_API Vec512 SumOfLanes(Full512 d, Vec512 v) { + const RepartitionToWide d32; + const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF)); + const auto odd = ShiftRight<16>(BitCast(d32, v)); + const auto sum = SumOfLanes(d32, even + odd); + // Also broadcast into odd lanes. 
+ return OddEven(BitCast(d, ShiftLeft<16>(sum)), BitCast(d, sum)); +} +HWY_API Vec512 SumOfLanes(Full512 d, Vec512 v) { + const RepartitionToWide d32; + // Sign-extend + const auto even = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, v))); + const auto odd = ShiftRight<16>(BitCast(d32, v)); + const auto sum = SumOfLanes(d32, even + odd); + // Also broadcast into odd lanes. + return OddEven(BitCast(d, ShiftLeft<16>(sum)), BitCast(d, sum)); +} + +// Returns the minimum in each lane. +HWY_API Vec512 MinOfLanes(Full512 d, Vec512 v) { + return Set(d, _mm512_reduce_min_epi32(v.raw)); +} +HWY_API Vec512 MinOfLanes(Full512 d, Vec512 v) { + return Set(d, _mm512_reduce_min_epi64(v.raw)); +} +HWY_API Vec512 MinOfLanes(Full512 d, Vec512 v) { + return Set(d, _mm512_reduce_min_epu32(v.raw)); +} +HWY_API Vec512 MinOfLanes(Full512 d, Vec512 v) { + return Set(d, _mm512_reduce_min_epu64(v.raw)); +} +HWY_API Vec512 MinOfLanes(Full512 d, Vec512 v) { + return Set(d, _mm512_reduce_min_ps(v.raw)); +} +HWY_API Vec512 MinOfLanes(Full512 d, Vec512 v) { + return Set(d, _mm512_reduce_min_pd(v.raw)); +} +HWY_API Vec512 MinOfLanes(Full512 d, Vec512 v) { + const RepartitionToWide d32; + const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF)); + const auto odd = ShiftRight<16>(BitCast(d32, v)); + const auto min = MinOfLanes(d32, Min(even, odd)); + // Also broadcast into odd lanes. + return OddEven(BitCast(d, ShiftLeft<16>(min)), BitCast(d, min)); +} +HWY_API Vec512 MinOfLanes(Full512 d, Vec512 v) { + const RepartitionToWide d32; + // Sign-extend + const auto even = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, v))); + const auto odd = ShiftRight<16>(BitCast(d32, v)); + const auto min = MinOfLanes(d32, Min(even, odd)); + // Also broadcast into odd lanes. + return OddEven(BitCast(d, ShiftLeft<16>(min)), BitCast(d, min)); +} + +// Returns the maximum in each lane. +HWY_API Vec512 MaxOfLanes(Full512 d, Vec512 v) { + return Set(d, _mm512_reduce_max_epi32(v.raw)); +} +HWY_API Vec512 MaxOfLanes(Full512 d, Vec512 v) { + return Set(d, _mm512_reduce_max_epi64(v.raw)); +} +HWY_API Vec512 MaxOfLanes(Full512 d, Vec512 v) { + return Set(d, _mm512_reduce_max_epu32(v.raw)); +} +HWY_API Vec512 MaxOfLanes(Full512 d, Vec512 v) { + return Set(d, _mm512_reduce_max_epu64(v.raw)); +} +HWY_API Vec512 MaxOfLanes(Full512 d, Vec512 v) { + return Set(d, _mm512_reduce_max_ps(v.raw)); +} +HWY_API Vec512 MaxOfLanes(Full512 d, Vec512 v) { + return Set(d, _mm512_reduce_max_pd(v.raw)); +} +HWY_API Vec512 MaxOfLanes(Full512 d, Vec512 v) { + const RepartitionToWide d32; + const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF)); + const auto odd = ShiftRight<16>(BitCast(d32, v)); + const auto min = MaxOfLanes(d32, Max(even, odd)); + // Also broadcast into odd lanes. + return OddEven(BitCast(d, ShiftLeft<16>(min)), BitCast(d, min)); +} +HWY_API Vec512 MaxOfLanes(Full512 d, Vec512 v) { + const RepartitionToWide d32; + // Sign-extend + const auto even = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, v))); + const auto odd = ShiftRight<16>(BitCast(d32, v)); + const auto min = MaxOfLanes(d32, Max(even, odd)); + // Also broadcast into odd lanes. + return OddEven(BitCast(d, ShiftLeft<16>(min)), BitCast(d, min)); +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +// Note that the GCC warnings are not suppressed if we only wrap the *intrin.h - +// the warning seems to be issued at the call site of intrinsics, i.e. our code. 
+HWY_DIAGNOSTICS(pop)
diff --git a/hwy/per_target.cc b/hwy/per_target.cc
new file mode 100644
index 0000000..4cbf152
--- /dev/null
+++ b/hwy/per_target.cc
@@ -0,0 +1,50 @@
+// Copyright 2022 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "hwy/per_target.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "hwy/per_target.cc"
+#include "hwy/foreach_target.h"  // IWYU pragma: keep
+#include "hwy/highway.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+// On SVE, Lanes rounds down to a power of two, but we want to know the actual
+// size here. Otherwise, hypothetical SVE with 48 bytes would round down to 32
+// and we'd enable HWY_SVE_256, and then fail reverse_test because Reverse on
+// HWY_SVE_256 requires the actual vector to be a power of two.
+#if HWY_TARGET == HWY_SVE || HWY_TARGET == HWY_SVE2 || \
+    HWY_TARGET == HWY_SVE_256
+size_t GetVectorBytes() { return detail::AllHardwareLanes(hwy::SizeTag<1>()); }
+#else
+size_t GetVectorBytes() { return Lanes(ScalableTag<uint8_t>()); }
+#endif
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+
+}  // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace hwy {
+namespace {
+HWY_EXPORT(GetVectorBytes);  // Local function.
+}  // namespace
+
+size_t VectorBytes() { return HWY_DYNAMIC_DISPATCH(GetVectorBytes)(); }
+
+}  // namespace hwy
+#endif  // HWY_ONCE
diff --git a/hwy/per_target.h b/hwy/per_target.h
new file mode 100644
index 0000000..da85de3
--- /dev/null
+++ b/hwy/per_target.h
@@ -0,0 +1,37 @@
+// Copyright 2022 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef HIGHWAY_HWY_PER_TARGET_H_
+#define HIGHWAY_HWY_PER_TARGET_H_
+
+#include <stddef.h>
+
+// Per-target functions.
+
+namespace hwy {
+
+// Returns size in bytes of a vector, i.e. `Lanes(ScalableTag<uint8_t>())`.
+//
+// Do not cache the result, which may change after calling DisableTargets, or
+// if software requests a different vector size (e.g. when entering/exiting SME
+// streaming mode). Instead call this right before the code that depends on the
+// result, without any DisableTargets or SME transition in-between. Note that
+// this involves an indirect call, so prefer not to call this frequently nor
+// unnecessarily.
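+// A small usage sketch (caller-side assumption, not part of this header):
+//   const size_t bytes = hwy::VectorBytes();  // e.g. 64 when AVX-512 is used
+//   // Re-query after any DisableTargets call rather than caching `bytes`.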
+size_t VectorBytes(); + +} // namespace hwy + +#endif // HIGHWAY_HWY_PER_TARGET_H_ diff --git a/hwy/print-inl.h b/hwy/print-inl.h new file mode 100644 index 0000000..d256657 --- /dev/null +++ b/hwy/print-inl.h @@ -0,0 +1,55 @@ +// Copyright 2022 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Print() function + +#include + +#include "hwy/aligned_allocator.h" +#include "hwy/highway.h" +#include "hwy/print.h" + +// Per-target include guard +#if defined(HIGHWAY_HWY_PRINT_INL_H_) == \ + defined(HWY_TARGET_TOGGLE) +#ifdef HIGHWAY_HWY_PRINT_INL_H_ +#undef HIGHWAY_HWY_PRINT_INL_H_ +#else +#define HIGHWAY_HWY_PRINT_INL_H_ +#endif + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { + +// Prints lanes around `lane`, in memory order. +template > +void Print(const D d, const char* caption, VecArg v, size_t lane_u = 0, + size_t max_lanes = 7) { + const size_t N = Lanes(d); + using T = TFromD; + auto lanes = AllocateAligned(N); + Store(v, d, lanes.get()); + + const auto info = hwy::detail::MakeTypeInfo(); + hwy::detail::PrintArray(info, caption, lanes.get(), N, lane_u, max_lanes); +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +#endif // per-target include guard diff --git a/hwy/print.cc b/hwy/print.cc new file mode 100644 index 0000000..0b52cde --- /dev/null +++ b/hwy/print.cc @@ -0,0 +1,110 @@ +// Copyright 2022 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "hwy/print.h" + +#ifndef __STDC_FORMAT_MACROS +#define __STDC_FORMAT_MACROS // before inttypes.h +#endif +#include +#include +#include + +#include "hwy/base.h" + +namespace hwy { +namespace detail { + +HWY_DLLEXPORT void TypeName(const TypeInfo& info, size_t N, char* string100) { + const char prefix = info.is_float ? 'f' : (info.is_signed ? 'i' : 'u'); + // Omit the xN suffix for scalars. + if (N == 1) { + // NOLINTNEXTLINE + snprintf(string100, 64, "%c%d", prefix, + static_cast(info.sizeof_t * 8)); + } else { + // NOLINTNEXTLINE + snprintf(string100, 64, "%c%dx%d", prefix, + static_cast(info.sizeof_t * 8), static_cast(N)); + } +} + +HWY_DLLEXPORT void ToString(const TypeInfo& info, const void* ptr, + char* string100) { + if (info.sizeof_t == 1) { + uint8_t byte; + CopyBytes<1>(ptr, &byte); // endian-safe: we ensured sizeof(T)=1. 
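+    // e.g. a lane holding 171 is rendered as "0xAB" by the snprintf below.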
+ snprintf(string100, 100, "0x%02X", byte); // NOLINT + } else if (info.sizeof_t == 2) { + uint16_t bits; + CopyBytes<2>(ptr, &bits); + snprintf(string100, 100, "0x%04X", bits); // NOLINT + } else if (info.sizeof_t == 4) { + if (info.is_float) { + float value; + CopyBytes<4>(ptr, &value); + snprintf(string100, 100, "%g", static_cast(value)); // NOLINT + } else if (info.is_signed) { + int32_t value; + CopyBytes<4>(ptr, &value); + snprintf(string100, 100, "%d", value); // NOLINT + } else { + uint32_t value; + CopyBytes<4>(ptr, &value); + snprintf(string100, 100, "%u", value); // NOLINT + } + } else { + HWY_ASSERT(info.sizeof_t == 8); + if (info.is_float) { + double value; + CopyBytes<8>(ptr, &value); + snprintf(string100, 100, "%g", value); // NOLINT + } else if (info.is_signed) { + int64_t value; + CopyBytes<8>(ptr, &value); + snprintf(string100, 100, "%" PRIi64 "", value); // NOLINT + } else { + uint64_t value; + CopyBytes<8>(ptr, &value); + snprintf(string100, 100, "%" PRIu64 "", value); // NOLINT + } + } +} + +HWY_DLLEXPORT void PrintArray(const TypeInfo& info, const char* caption, + const void* array_void, size_t N, size_t lane_u, + size_t max_lanes) { + const uint8_t* array_bytes = reinterpret_cast(array_void); + + char type_name[100]; + TypeName(info, N, type_name); + + const intptr_t lane = intptr_t(lane_u); + const size_t begin = static_cast(HWY_MAX(0, lane - 2)); + const size_t end = HWY_MIN(begin + max_lanes, N); + fprintf(stderr, "%s %s [%" PRIu64 "+ ->]:\n ", type_name, caption, + static_cast(begin)); + for (size_t i = begin; i < end; ++i) { + const void* ptr = array_bytes + i * info.sizeof_t; + char str[100]; + ToString(info, ptr, str); + fprintf(stderr, "%s,", str); + } + if (begin >= end) fprintf(stderr, "(out of bounds)"); + fprintf(stderr, "\n"); +} + +} // namespace detail +} // namespace hwy diff --git a/hwy/print.h b/hwy/print.h new file mode 100644 index 0000000..1379286 --- /dev/null +++ b/hwy/print.h @@ -0,0 +1,73 @@ +// Copyright 2022 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef HWY_PRINT_H_ +#define HWY_PRINT_H_ + +// Helpers for printing vector lanes. + +#include +#include + +#include "hwy/base.h" +#include "hwy/highway_export.h" + +namespace hwy { + +namespace detail { + +// For implementing value comparisons etc. as type-erased functions to reduce +// template bloat. 
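+// For instance, MakeTypeInfo<float>() (defined below) yields {sizeof_t = 4,
+// is_float = true, is_signed = true}, so a single PrintArray body can format
+// any lane type.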
+struct TypeInfo {
+  size_t sizeof_t;
+  bool is_float;
+  bool is_signed;
+};
+
+template <typename T>
+HWY_INLINE TypeInfo MakeTypeInfo() {
+  TypeInfo info;
+  info.sizeof_t = sizeof(T);
+  info.is_float = IsFloat<T>();
+  info.is_signed = IsSigned<T>();
+  return info;
+}
+
+HWY_DLLEXPORT void TypeName(const TypeInfo& info, size_t N, char* string100);
+HWY_DLLEXPORT void ToString(const TypeInfo& info, const void* ptr,
+                            char* string100);
+
+HWY_DLLEXPORT void PrintArray(const TypeInfo& info, const char* caption,
+                              const void* array_void, size_t N,
+                              size_t lane_u = 0, size_t max_lanes = 7);
+
+}  // namespace detail
+
+template <typename T>
+HWY_NOINLINE void PrintValue(T value) {
+  char str[100];
+  detail::ToString(hwy::detail::MakeTypeInfo<T>(), &value, str);
+  fprintf(stderr, "%s,", str);
+}
+
+template <typename T>
+HWY_NOINLINE void PrintArray(const T* value, size_t count) {
+  detail::PrintArray(hwy::detail::MakeTypeInfo<T>(), "", value, count, 0,
+                     count);
+}
+
+}  // namespace hwy
+
+#endif  // HWY_PRINT_H_
diff --git a/hwy/targets.cc b/hwy/targets.cc
new file mode 100644
index 0000000..2fde4db
--- /dev/null
+++ b/hwy/targets.cc
@@ -0,0 +1,434 @@
+// Copyright 2019 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "hwy/targets.h"
+
+#ifndef __STDC_FORMAT_MACROS
+#define __STDC_FORMAT_MACROS  // before inttypes.h
+#endif
+#include <inttypes.h>  // PRIx64
+#include <stdarg.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>
+
+#include <atomic>
+
+#include "hwy/per_target.h"  // VectorBytes
+
+#if HWY_IS_ASAN || HWY_IS_MSAN || HWY_IS_TSAN
+#include "sanitizer/common_interface_defs.h"  // __sanitizer_print_stack_trace
+#endif
+
+#include <stdlib.h>  // abort / exit
+
+#if HWY_ARCH_X86
+#include <xmmintrin.h>
+#if HWY_COMPILER_MSVC
+#include <intrin.h>
+#else  // !HWY_COMPILER_MSVC
+#include <cpuid.h>
+#endif  // HWY_COMPILER_MSVC
+
+#elif HWY_ARCH_ARM && HWY_OS_LINUX
+#include <asm/hwcap.h>
+#include <sys/auxv.h>
+#endif  // HWY_ARCH_*
+
+namespace hwy {
+namespace {
+
+#if HWY_ARCH_X86
+
+HWY_INLINE bool IsBitSet(const uint32_t reg, const int index) {
+  return (reg & (1U << index)) != 0;
+}
+
+// Calls the CPUID instruction with eax=level and ecx=count and returns the
+// result in the abcd array, where abcd = {eax, ebx, ecx, edx} (hence the
+// name).
+HWY_INLINE void Cpuid(const uint32_t level, const uint32_t count,
+                      uint32_t* HWY_RESTRICT abcd) {
+#if HWY_COMPILER_MSVC
+  int regs[4];
+  __cpuidex(regs, level, count);
+  for (int i = 0; i < 4; ++i) {
+    abcd[i] = regs[i];
+  }
+#else   // HWY_COMPILER_MSVC
+  uint32_t a;
+  uint32_t b;
+  uint32_t c;
+  uint32_t d;
+  __cpuid_count(level, count, a, b, c, d);
+  abcd[0] = a;
+  abcd[1] = b;
+  abcd[2] = c;
+  abcd[3] = d;
+#endif  // HWY_COMPILER_MSVC
+}
+
+// Returns the lower 32 bits of extended control register 0.
+// Requires CPU support for "OSXSAVE" (see below).
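+// (The .byte sequence below is the XGETBV instruction, spelled out so that
+// assemblers without the mnemonic still accept it.)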
+uint32_t ReadXCR0() {
+#if HWY_COMPILER_MSVC
+  return static_cast<uint32_t>(_xgetbv(0));
+#else   // HWY_COMPILER_MSVC
+  uint32_t xcr0, xcr0_high;
+  const uint32_t index = 0;
+  asm volatile(".byte 0x0F, 0x01, 0xD0"
+               : "=a"(xcr0), "=d"(xcr0_high)
+               : "c"(index));
+  return xcr0;
+#endif  // HWY_COMPILER_MSVC
+}
+
+#endif  // HWY_ARCH_X86
+
+// When running tests, this value can be set to the mocked supported targets
+// mask. Only written to from a single thread before the test starts.
+int64_t supported_targets_for_test_ = 0;
+
+// Mask of targets disabled at runtime with DisableTargets.
+int64_t supported_mask_ = LimitsMax<int64_t>();
+
+#if HWY_ARCH_X86
+// Arbitrary bit indices indicating which instruction set extensions are
+// supported. Use enum to ensure values are distinct.
+enum class FeatureIndex : uint32_t {
+  kSSE = 0,
+  kSSE2,
+  kSSE3,
+  kSSSE3,
+
+  kSSE41,
+  kSSE42,
+  kCLMUL,
+  kAES,
+
+  kAVX,
+  kAVX2,
+  kF16C,
+  kFMA,
+  kLZCNT,
+  kBMI,
+  kBMI2,
+
+  kAVX512F,
+  kAVX512VL,
+  kAVX512DQ,
+  kAVX512BW,
+
+  kVNNI,
+  kVPCLMULQDQ,
+  kVBMI,
+  kVBMI2,
+  kVAES,
+  kPOPCNTDQ,
+  kBITALG,
+
+  kSentinel
+};
+static_assert(static_cast<size_t>(FeatureIndex::kSentinel) < 64,
+              "Too many bits for u64");
+
+HWY_INLINE constexpr uint64_t Bit(FeatureIndex index) {
+  return 1ull << static_cast<size_t>(index);
+}
+
+constexpr uint64_t kGroupSSSE3 =
+    Bit(FeatureIndex::kSSE) | Bit(FeatureIndex::kSSE2) |
+    Bit(FeatureIndex::kSSE3) | Bit(FeatureIndex::kSSSE3);
+
+constexpr uint64_t kGroupSSE4 =
+    Bit(FeatureIndex::kSSE41) | Bit(FeatureIndex::kSSE42) |
+    Bit(FeatureIndex::kCLMUL) | Bit(FeatureIndex::kAES) | kGroupSSSE3;
+
+// We normally assume BMI/BMI2/FMA are available if AVX2 is. This allows us to
+// use BZHI and (compiler-generated) MULX. However, VirtualBox lacks them
+// [https://www.virtualbox.org/ticket/15471]. Thus we provide the option of
+// neither using nor requiring them, so that AVX2 can still be used.
+#ifdef HWY_DISABLE_BMI2_FMA
+constexpr uint64_t kGroupBMI2_FMA = 0;
+#else
+constexpr uint64_t kGroupBMI2_FMA = Bit(FeatureIndex::kBMI) |
+                                    Bit(FeatureIndex::kBMI2) |
+                                    Bit(FeatureIndex::kFMA);
+#endif
+
+#ifdef HWY_DISABLE_F16C
+constexpr uint64_t kGroupF16C = 0;
+#else
+constexpr uint64_t kGroupF16C = Bit(FeatureIndex::kF16C);
+#endif
+
+constexpr uint64_t kGroupAVX2 =
+    Bit(FeatureIndex::kAVX) | Bit(FeatureIndex::kAVX2) |
+    Bit(FeatureIndex::kLZCNT) | kGroupBMI2_FMA | kGroupF16C | kGroupSSE4;
+
+constexpr uint64_t kGroupAVX3 =
+    Bit(FeatureIndex::kAVX512F) | Bit(FeatureIndex::kAVX512VL) |
+    Bit(FeatureIndex::kAVX512DQ) | Bit(FeatureIndex::kAVX512BW) | kGroupAVX2;
+
+constexpr uint64_t kGroupAVX3_DL =
+    Bit(FeatureIndex::kVNNI) | Bit(FeatureIndex::kVPCLMULQDQ) |
+    Bit(FeatureIndex::kVBMI) | Bit(FeatureIndex::kVBMI2) |
+    Bit(FeatureIndex::kVAES) | Bit(FeatureIndex::kPOPCNTDQ) |
+    Bit(FeatureIndex::kBITALG) | kGroupAVX3;
+
+#endif  // HWY_ARCH_X86
+
+// Returns targets supported by the CPU, independently of DisableTargets.
+// Factored out of SupportedTargets to make its structure more obvious. Note
+// that x86 CPUID may take several hundred cycles.
+int64_t DetectTargets() {
+  // Apps will use only one of these (the default is EMU128), but compile
+  // flags for this TU may differ from those of the app, so allow both.
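+  // Illustrative mapping (example CPU, not from the original source): a CPU
+  // with every kGroupAVX3 flag set reports HWY_AVX3 | HWY_AVX2 | HWY_SSE4 |
+  // HWY_SSSE3 below, since each group includes its predecessors.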
+ int64_t bits = HWY_SCALAR | HWY_EMU128; + +#if HWY_ARCH_X86 + bool has_osxsave = false; + { // ensures we do not accidentally use flags outside this block + uint64_t flags = 0; + uint32_t abcd[4]; + + Cpuid(0, 0, abcd); + const uint32_t max_level = abcd[0]; + + // Standard feature flags + Cpuid(1, 0, abcd); + flags |= IsBitSet(abcd[3], 25) ? Bit(FeatureIndex::kSSE) : 0; + flags |= IsBitSet(abcd[3], 26) ? Bit(FeatureIndex::kSSE2) : 0; + flags |= IsBitSet(abcd[2], 0) ? Bit(FeatureIndex::kSSE3) : 0; + flags |= IsBitSet(abcd[2], 1) ? Bit(FeatureIndex::kCLMUL) : 0; + flags |= IsBitSet(abcd[2], 9) ? Bit(FeatureIndex::kSSSE3) : 0; + flags |= IsBitSet(abcd[2], 12) ? Bit(FeatureIndex::kFMA) : 0; + flags |= IsBitSet(abcd[2], 19) ? Bit(FeatureIndex::kSSE41) : 0; + flags |= IsBitSet(abcd[2], 20) ? Bit(FeatureIndex::kSSE42) : 0; + flags |= IsBitSet(abcd[2], 25) ? Bit(FeatureIndex::kAES) : 0; + flags |= IsBitSet(abcd[2], 28) ? Bit(FeatureIndex::kAVX) : 0; + flags |= IsBitSet(abcd[2], 29) ? Bit(FeatureIndex::kF16C) : 0; + has_osxsave = IsBitSet(abcd[2], 27); + + // Extended feature flags + Cpuid(0x80000001U, 0, abcd); + flags |= IsBitSet(abcd[2], 5) ? Bit(FeatureIndex::kLZCNT) : 0; + + // Extended features + if (max_level >= 7) { + Cpuid(7, 0, abcd); + flags |= IsBitSet(abcd[1], 3) ? Bit(FeatureIndex::kBMI) : 0; + flags |= IsBitSet(abcd[1], 5) ? Bit(FeatureIndex::kAVX2) : 0; + flags |= IsBitSet(abcd[1], 8) ? Bit(FeatureIndex::kBMI2) : 0; + + flags |= IsBitSet(abcd[1], 16) ? Bit(FeatureIndex::kAVX512F) : 0; + flags |= IsBitSet(abcd[1], 17) ? Bit(FeatureIndex::kAVX512DQ) : 0; + flags |= IsBitSet(abcd[1], 30) ? Bit(FeatureIndex::kAVX512BW) : 0; + flags |= IsBitSet(abcd[1], 31) ? Bit(FeatureIndex::kAVX512VL) : 0; + + flags |= IsBitSet(abcd[2], 1) ? Bit(FeatureIndex::kVBMI) : 0; + flags |= IsBitSet(abcd[2], 6) ? Bit(FeatureIndex::kVBMI2) : 0; + flags |= IsBitSet(abcd[2], 9) ? Bit(FeatureIndex::kVAES) : 0; + flags |= IsBitSet(abcd[2], 10) ? Bit(FeatureIndex::kVPCLMULQDQ) : 0; + flags |= IsBitSet(abcd[2], 11) ? Bit(FeatureIndex::kVNNI) : 0; + flags |= IsBitSet(abcd[2], 12) ? Bit(FeatureIndex::kBITALG) : 0; + flags |= IsBitSet(abcd[2], 14) ? Bit(FeatureIndex::kPOPCNTDQ) : 0; + } + + // Set target bit(s) if all their group's flags are all set. + if ((flags & kGroupAVX3_DL) == kGroupAVX3_DL) { + bits |= HWY_AVX3_DL; + } + if ((flags & kGroupAVX3) == kGroupAVX3) { + bits |= HWY_AVX3; + } + if ((flags & kGroupAVX2) == kGroupAVX2) { + bits |= HWY_AVX2; + } + if ((flags & kGroupSSE4) == kGroupSSE4) { + bits |= HWY_SSE4; + } + if ((flags & kGroupSSSE3) == kGroupSSSE3) { + bits |= HWY_SSSE3; + } + } + + // Clear bits if the OS does not support XSAVE - otherwise, registers + // are not preserved across context switches. 
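+  // Hypothetical example: xcr0 = 0x07 (x87/XMM/YMM enabled, opmask/ZMM bits
+  // 5..7 clear, as on some VMs) keeps AVX2 but strips the AVX3 bits below.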
+ if (has_osxsave) { + const uint32_t xcr0 = ReadXCR0(); + const int64_t min_avx3 = HWY_AVX3 | HWY_AVX3_DL; + const int64_t min_avx2 = HWY_AVX2 | min_avx3; + // XMM + if (!IsBitSet(xcr0, 1)) { + bits &= ~(HWY_SSSE3 | HWY_SSE4 | min_avx2); + } + // YMM + if (!IsBitSet(xcr0, 2)) { + bits &= ~min_avx2; + } + // opmask, ZMM lo/hi + if (!IsBitSet(xcr0, 5) || !IsBitSet(xcr0, 6) || !IsBitSet(xcr0, 7)) { + bits &= ~min_avx3; + } + } + + if ((bits & HWY_ENABLED_BASELINE) != HWY_ENABLED_BASELINE) { + fprintf(stderr, + "WARNING: CPU supports %" PRIx64 " but software requires %" PRIx64 + "\n", + bits, static_cast(HWY_ENABLED_BASELINE)); + } + +#elif HWY_ARCH_ARM && HWY_HAVE_RUNTIME_DISPATCH + using CapBits = unsigned long; // NOLINT + const CapBits hw = getauxval(AT_HWCAP); + (void)hw; + +#if HWY_ARCH_ARM_A64 + +#if defined(HWCAP_AES) + // aarch64 always has NEON and VFPv4, but not necessarily AES, which we + // require and thus must still check for. + if (hw & HWCAP_AES) { + bits |= HWY_NEON; + } +#endif // HWCAP_AES + +#if defined(HWCAP_SVE) + if (hw & HWCAP_SVE) { + bits |= HWY_SVE; + } +#endif + +#if defined(HWCAP2_SVE2) && defined(HWCAP2_SVEAES) + const CapBits hw2 = getauxval(AT_HWCAP2); + if ((hw2 & HWCAP2_SVE2) && (hw2 & HWCAP2_SVEAES)) { + bits |= HWY_SVE2; + } +#endif + +#else // HWY_ARCH_ARM_A64 + +// Some old auxv.h / hwcap.h do not define these. If not, treat as unsupported. +// Note that AES has a different HWCAP bit compared to aarch64. +#if defined(HWCAP_NEON) && defined(HWCAP_VFPv4) + if ((hw & HWCAP_NEON) && (hw & HWCAP_VFPv4)) { + bits |= HWY_NEON; + } +#endif + +#endif // HWY_ARCH_ARM_A64 + if ((bits & HWY_ENABLED_BASELINE) != HWY_ENABLED_BASELINE) { + fprintf(stderr, + "WARNING: CPU supports %" PRIx64 " but software requires %" PRIx64 + "\n", + bits, static_cast(HWY_ENABLED_BASELINE)); + } +#else // HWY_ARCH_ARM && HWY_HAVE_RUNTIME_DISPATCH + // TODO(janwas): detect for other platforms and check for baseline + // This file is typically compiled without HWY_IS_TEST, but targets_test has + // it set, and will expect all of its HWY_TARGETS (= all attainable) to be + // supported. + bits |= HWY_ENABLED_BASELINE; +#endif // HWY_ARCH_X86 + + return bits; +} + +} // namespace + +HWY_DLLEXPORT HWY_NORETURN void HWY_FORMAT(3, 4) + Abort(const char* file, int line, const char* format, ...) { + char buf[2000]; + va_list args; + va_start(args, format); + vsnprintf(buf, sizeof(buf), format, args); + va_end(args); + + fprintf(stderr, "Abort at %s:%d: %s\n", file, line, buf); + +// If compiled with any sanitizer, they can also print a stack trace. +#if HWY_IS_ASAN || HWY_IS_MSAN || HWY_IS_TSAN + __sanitizer_print_stack_trace(); +#endif // HWY_IS_* + fflush(stderr); + +// Now terminate the program: +#if HWY_ARCH_RVV + exit(1); // trap/abort just freeze Spike. +#elif HWY_IS_DEBUG_BUILD && !HWY_COMPILER_MSVC + // Facilitates breaking into a debugger, but don't use this in non-debug + // builds because it looks like "illegal instruction", which is misleading. + __builtin_trap(); +#else + abort(); // Compile error without this due to HWY_NORETURN. +#endif +} + +HWY_DLLEXPORT void DisableTargets(int64_t disabled_targets) { + supported_mask_ = static_cast(~disabled_targets); + // This will take effect on the next call to SupportedTargets, which is + // called right before GetChosenTarget::Update. However, calling Update here + // would make it appear that HWY_DYNAMIC_DISPATCH was called, which we want + // to check in tests. 
We instead de-initialize such that the next + // HWY_DYNAMIC_DISPATCH calls GetChosenTarget::Update via FunctionCache. + GetChosenTarget().DeInit(); +} + +HWY_DLLEXPORT void SetSupportedTargetsForTest(int64_t targets) { + supported_targets_for_test_ = targets; + GetChosenTarget().DeInit(); // see comment above +} + +HWY_DLLEXPORT int64_t SupportedTargets() { + int64_t targets = supported_targets_for_test_; + if (HWY_LIKELY(targets == 0)) { + // Mock not active. Re-detect instead of caching just in case we're on a + // heterogeneous ISA (also requires some app support to pin threads). This + // is only reached on the first HWY_DYNAMIC_DISPATCH or after each call to + // DisableTargets or SetSupportedTargetsForTest. + targets = DetectTargets(); + + // VectorBytes invokes HWY_DYNAMIC_DISPATCH. To prevent infinite recursion, + // first set up ChosenTarget. No need to Update() again afterwards with the + // final targets - that will be done by a caller of this function. + GetChosenTarget().Update(targets); + + // Now that we can call VectorBytes, check for targets with specific sizes. + if (HWY_ARCH_ARM_A64) { + const size_t vec_bytes = VectorBytes(); // uncached, see declaration + if ((targets & HWY_SVE) && vec_bytes == 32) { + targets = static_cast(targets | HWY_SVE_256); + } else { + targets = static_cast(targets & ~HWY_SVE_256); + } + if ((targets & HWY_SVE2) && vec_bytes == 16) { + targets = static_cast(targets | HWY_SVE2_128); + } else { + targets = static_cast(targets & ~HWY_SVE2_128); + } + } // HWY_ARCH_ARM_A64 + } + + targets &= supported_mask_; + return targets == 0 ? HWY_STATIC_TARGET : targets; +} + +HWY_DLLEXPORT ChosenTarget& GetChosenTarget() { + static ChosenTarget chosen_target; + return chosen_target; +} + +} // namespace hwy diff --git a/hwy/targets.h b/hwy/targets.h new file mode 100644 index 0000000..2d9afbf --- /dev/null +++ b/hwy/targets.h @@ -0,0 +1,318 @@ +// Copyright 2020 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef HIGHWAY_HWY_TARGETS_H_ +#define HIGHWAY_HWY_TARGETS_H_ + +#include + +// For SIMD module implementations and their callers. Defines which targets to +// generate and call. + +#include "hwy/base.h" +#include "hwy/detect_targets.h" +#include "hwy/highway_export.h" + +#if !HWY_ARCH_RVV +#include +#endif + +namespace hwy { + +// Returns bitfield of enabled targets that are supported on this CPU; there is +// always at least one such target, hence the return value is never 0. The +// targets returned may change after calling DisableTargets. This function is +// always defined, but the HWY_SUPPORTED_TARGETS wrapper may allow eliding +// calls to it if there is only a single target enabled. +HWY_DLLEXPORT int64_t SupportedTargets(); + +// Evaluates to a function call, or literal if there is a single target. 
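+// ((X & (X - 1)) == 0 is the usual single-bit test: with exactly one target
+// compiled in, HWY_SUPPORTED_TARGETS folds to the HWY_TARGETS constant and
+// the SupportedTargets() call disappears.)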
+#if (HWY_TARGETS & (HWY_TARGETS - 1)) == 0 +#define HWY_SUPPORTED_TARGETS HWY_TARGETS +#else +#define HWY_SUPPORTED_TARGETS hwy::SupportedTargets() +#endif + +// Subsequent SupportedTargets will not return targets whose bit(s) are set in +// `disabled_targets`. Exception: if SupportedTargets would return 0, it will +// instead return HWY_STATIC_TARGET (there must always be one target to call). +// +// This function is useful for disabling targets known to be buggy, or if the +// best available target is undesirable (perhaps due to throttling or memory +// bandwidth limitations). Use SetSupportedTargetsForTest instead of this +// function for iteratively enabling specific targets for testing. +HWY_DLLEXPORT void DisableTargets(int64_t disabled_targets); + +// Subsequent SupportedTargets will return the given set of targets, except +// those disabled via DisableTargets. Call with a mask of 0 to disable the mock +// and return to the normal SupportedTargets behavior. Used to run tests for +// all targets. +HWY_DLLEXPORT void SetSupportedTargetsForTest(int64_t targets); + +// Return the list of targets in HWY_TARGETS supported by the CPU as a list of +// individual HWY_* target macros such as HWY_SCALAR or HWY_NEON. This list +// is affected by the current SetSupportedTargetsForTest() mock if any. +HWY_INLINE std::vector SupportedAndGeneratedTargets() { + std::vector ret; + for (int64_t targets = SupportedTargets() & HWY_TARGETS; targets != 0; + targets = targets & (targets - 1)) { + int64_t current_target = targets & ~(targets - 1); + ret.push_back(current_target); + } + return ret; +} + +static inline HWY_MAYBE_UNUSED const char* TargetName(int64_t target) { + switch (target) { +#if HWY_ARCH_X86 + case HWY_SSSE3: + return "SSSE3"; + case HWY_SSE4: + return "SSE4"; + case HWY_AVX2: + return "AVX2"; + case HWY_AVX3: + return "AVX3"; + case HWY_AVX3_DL: + return "AVX3_DL"; +#endif + +#if HWY_ARCH_ARM + case HWY_SVE2_128: + return "SVE2_128"; + case HWY_SVE_256: + return "SVE_256"; + case HWY_SVE2: + return "SVE2"; + case HWY_SVE: + return "SVE"; + case HWY_NEON: + return "NEON"; +#endif + +#if HWY_ARCH_PPC + case HWY_PPC8: + return "PPC8"; +#endif + +#if HWY_ARCH_WASM + case HWY_WASM: + return "WASM"; + case HWY_WASM_EMU256: + return "WASM_EMU256"; +#endif + +#if HWY_ARCH_RVV + case HWY_RVV: + return "RVV"; +#endif + + case HWY_EMU128: + return "EMU128"; + case HWY_SCALAR: + return "SCALAR"; + + default: + return "Unknown"; // must satisfy gtest IsValidParamName() + } +} + +// The maximum number of dynamic targets on any architecture is defined by +// HWY_MAX_DYNAMIC_TARGETS and depends on the arch. + +// For the ChosenTarget mask and index we use a different bit arrangement than +// in the HWY_TARGETS mask. Only the targets involved in the current +// architecture are used in this mask, and therefore only the least significant +// (HWY_MAX_DYNAMIC_TARGETS + 2) bits of the int64_t mask are used. The least +// significant bit is set when the mask is not initialized, the next +// HWY_MAX_DYNAMIC_TARGETS more significant bits are a range of bits from the +// HWY_TARGETS or SupportedTargets() mask for the given architecture shifted to +// that position and the next more significant bit is used for HWY_SCALAR (if +// HWY_COMPILE_ONLY_SCALAR is defined) or HWY_EMU128. Because of this we need to +// define equivalent values for HWY_TARGETS in this representation. 
+// This mask representation allows us to use ctz() on the mask and obtain a
+// small number that's used as an index into the table for dynamic dispatch.
+// In this way the first entry is used when the mask is uninitialized, the
+// following HWY_MAX_DYNAMIC_TARGETS are for dynamic dispatch and the last one
+// is for scalar.
+
+// The HWY_SCALAR/HWY_EMU128 bit in the ChosenTarget mask format.
+#define HWY_CHOSEN_TARGET_MASK_SCALAR (1LL << (HWY_MAX_DYNAMIC_TARGETS + 1))
+
+// Converts from a HWY_TARGETS mask to a ChosenTarget mask format for the
+// current architecture.
+#define HWY_CHOSEN_TARGET_SHIFT(X)                                    \
+  ((((X) >> (HWY_HIGHEST_TARGET_BIT + 1 - HWY_MAX_DYNAMIC_TARGETS)) & \
+    ((1LL << HWY_MAX_DYNAMIC_TARGETS) - 1))                           \
+   << 1)
+
+// The HWY_TARGETS mask in the ChosenTarget mask format.
+#define HWY_CHOSEN_TARGET_MASK_TARGETS \
+  (HWY_CHOSEN_TARGET_SHIFT(HWY_TARGETS) | HWY_CHOSEN_TARGET_MASK_SCALAR | 1LL)
+
+#if HWY_ARCH_X86
+// Maximum number of dynamic targets; changing this value is an
+// ABI-incompatible change.
+#define HWY_MAX_DYNAMIC_TARGETS 15
+#define HWY_HIGHEST_TARGET_BIT HWY_HIGHEST_TARGET_BIT_X86
+// These must match the order in which the HWY_TARGETS are defined, starting
+// from the least significant (HWY_HIGHEST_TARGET_BIT + 1 -
+// HWY_MAX_DYNAMIC_TARGETS) bit. This list must contain exactly
+// HWY_MAX_DYNAMIC_TARGETS elements and does not include SCALAR. The first
+// entry corresponds to the best target. Don't include a "," at the end of
+// the list.
+#define HWY_CHOOSE_TARGET_LIST(func_name)                 \
+  nullptr,                        /* reserved */          \
+  nullptr,                        /* reserved */          \
+  nullptr,                        /* reserved */          \
+  nullptr,                        /* reserved */          \
+  nullptr,                        /* reserved */          \
+  nullptr,                        /* reserved */          \
+  nullptr,                        /* reserved */          \
+  HWY_CHOOSE_AVX3_DL(func_name),  /* AVX3_DL */           \
+  HWY_CHOOSE_AVX3(func_name),     /* AVX3 */              \
+  HWY_CHOOSE_AVX2(func_name),     /* AVX2 */              \
+  nullptr,                        /* AVX */               \
+  HWY_CHOOSE_SSE4(func_name),     /* SSE4 */              \
+  HWY_CHOOSE_SSSE3(func_name),    /* SSSE3 */             \
+  nullptr,                        /* reserved - SSE3? */  \
+  nullptr                         /* reserved - SSE2? */
+
+#elif HWY_ARCH_ARM
+// See HWY_ARCH_X86 above for details.
+#define HWY_MAX_DYNAMIC_TARGETS 15
+#define HWY_HIGHEST_TARGET_BIT HWY_HIGHEST_TARGET_BIT_ARM
+#define HWY_CHOOSE_TARGET_LIST(func_name)                  \
+  nullptr,                         /* reserved */          \
+  nullptr,                         /* reserved */          \
+  nullptr,                         /* reserved */          \
+  nullptr,                         /* reserved */          \
+  nullptr,                         /* reserved */          \
+  nullptr,                         /* reserved */          \
+  nullptr,                         /* reserved */          \
+  nullptr,                         /* reserved */          \
+  nullptr,                         /* reserved */          \
+  HWY_CHOOSE_SVE2_128(func_name),  /* SVE2 128-bit */      \
+  HWY_CHOOSE_SVE_256(func_name),   /* SVE 256-bit */       \
+  HWY_CHOOSE_SVE2(func_name),      /* SVE2 */              \
+  HWY_CHOOSE_SVE(func_name),       /* SVE */               \
+  HWY_CHOOSE_NEON(func_name),      /* NEON */              \
+  nullptr                          /* reserved - Helium? */
+
+#elif HWY_ARCH_RVV
+// See HWY_ARCH_X86 above for details.
+#define HWY_MAX_DYNAMIC_TARGETS 9
+#define HWY_HIGHEST_TARGET_BIT HWY_HIGHEST_TARGET_BIT_RVV
+#define HWY_CHOOSE_TARGET_LIST(func_name)       \
+  nullptr,                     /* reserved */   \
+  nullptr,                     /* reserved */   \
+  nullptr,                     /* reserved */   \
+  nullptr,                     /* reserved */   \
+  nullptr,                     /* reserved */   \
+  nullptr,                     /* reserved */   \
+  nullptr,                     /* reserved */   \
+  HWY_CHOOSE_RVV(func_name),   /* RVV */        \
+  nullptr                      /* reserved */
+
+#elif HWY_ARCH_PPC
+// See HWY_ARCH_X86 above for details.
+#define HWY_MAX_DYNAMIC_TARGETS 9
+#define HWY_HIGHEST_TARGET_BIT HWY_HIGHEST_TARGET_BIT_PPC
+#define HWY_CHOOSE_TARGET_LIST(func_name)                     \
+  nullptr,                    /* reserved */                  \
+  nullptr,                    /* reserved */                  \
+  nullptr,                    /* reserved */                  \
+  nullptr,                    /* reserved */                  \
+  nullptr,                    /* reserved */                  \
+  nullptr,                    /* reserved */                  \
+  HWY_CHOOSE_PPC8(func_name), /* PPC8 */                      \
+  nullptr,                    /* reserved (VSX or AltiVec) */ \
+  nullptr                     /* reserved (VSX or AltiVec) */
+
+#elif HWY_ARCH_WASM
+// See HWY_ARCH_X86 above for details.
+#define HWY_MAX_DYNAMIC_TARGETS 9
+#define HWY_HIGHEST_TARGET_BIT HWY_HIGHEST_TARGET_BIT_WASM
+#define HWY_CHOOSE_TARGET_LIST(func_name)              \
+  nullptr,                           /* reserved */    \
+  nullptr,                           /* reserved */    \
+  nullptr,                           /* reserved */    \
+  nullptr,                           /* reserved */    \
+  nullptr,                           /* reserved */    \
+  nullptr,                           /* reserved */    \
+  HWY_CHOOSE_WASM_EMU256(func_name), /* WASM_EMU256 */ \
+  HWY_CHOOSE_WASM(func_name),        /* WASM */        \
+  nullptr                            /* reserved */
+
+#else
+// Unknown architecture, will use HWY_SCALAR without dynamic dispatch, though
+// still creating single-entry tables in HWY_EXPORT to ensure portability.
+#define HWY_MAX_DYNAMIC_TARGETS 1
+#define HWY_HIGHEST_TARGET_BIT HWY_HIGHEST_TARGET_BIT_SCALAR
+#endif
+
+// Bitfield of supported and enabled targets. The format differs from that of
+// HWY_TARGETS; the lowest bit governs the first function pointer (which is
+// special in that it calls FunctionCache, then Update, then dispatches to the
+// actual implementation) in the tables created by HWY_EXPORT. Monostate (see
+// GetChosenTarget), thread-safe except on RVV.
+struct ChosenTarget {
+ public:
+  // Reset bits according to `targets` (typically the return value of
+  // SupportedTargets()). Postcondition: IsInitialized() == true.
+  void Update(int64_t targets) {
+    // These are `targets` shifted downwards, see above. Also include SCALAR
+    // (corresponds to the last entry in the function table) as fallback.
+    StoreMask(HWY_CHOSEN_TARGET_SHIFT(targets) |
+              HWY_CHOSEN_TARGET_MASK_SCALAR);
+  }
+
+  // Reset to the uninitialized state, so that FunctionCache will call Update
+  // during the next HWY_DYNAMIC_DISPATCH, and IsInitialized returns false.
+  void DeInit() { StoreMask(1); }
+
+  // Whether Update was called. This indicates whether any
+  // HWY_DYNAMIC_DISPATCH function was called, which we check in tests.
+  bool IsInitialized() const { return LoadMask() != 1; }
+
+  // Return the index in the dynamic dispatch table to be used by the current
+  // CPU. Note that this method must be in the header file so it uses the
+  // value of HWY_CHOSEN_TARGET_MASK_TARGETS defined in the translation unit
+  // that calls it, which may be different from others. This means we only
+  // enable those targets that were actually compiled in this module.
+  size_t HWY_INLINE GetIndex() const {
+    return hwy::Num0BitsBelowLS1Bit_Nonzero64(
+        static_cast<uint64_t>(LoadMask() & HWY_CHOSEN_TARGET_MASK_TARGETS));
+  }
+
+ private:
+  // TODO(janwas): remove #if once <atomic> is available
+#if HWY_ARCH_RVV
+  int64_t LoadMask() const { return mask_; }
+  void StoreMask(int64_t mask) { mask_ = mask; }
+
+  int64_t mask_{1};  // Initialized to 1 so GetIndex() returns 0.
+#else
+  int64_t LoadMask() const { return mask_.load(); }
+  void StoreMask(int64_t mask) { mask_.store(mask); }
+
+  std::atomic<int64_t> mask_{1};  // Initialized to 1 so GetIndex() returns 0.
+#endif  // HWY_ARCH_RVV
+};
+
+// For internal use (e.g. by FunctionCache and DisableTargets).
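+// (Illustrative walk-through of the index math above, assuming x86: if AVX2
+// is the best enabled target, the lowest set bit of the stored mask after
+// HWY_CHOSEN_TARGET_SHIFT is the AVX2 slot, so GetIndex() returns its
+// position and HWY_DYNAMIC_DISPATCH calls the HWY_CHOOSE_AVX2(func_name)
+// entry of the HWY_EXPORT table; an uninitialized mask of 1 yields index 0,
+// the FunctionCache stub.)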
+// For internal use (e.g. by FunctionCache and DisableTargets).
+HWY_DLLEXPORT ChosenTarget& GetChosenTarget();
+
+}  // namespace hwy
+
+#endif  // HIGHWAY_HWY_TARGETS_H_
diff --git a/hwy/targets_test.cc b/hwy/targets_test.cc
new file mode 100644
index 0000000..e58a6fa
--- /dev/null
+++ b/hwy/targets_test.cc
@@ -0,0 +1,135 @@
+// Copyright 2020 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "hwy/targets.h"
+
+#include "hwy/tests/test_util-inl.h"
+
+namespace fake {
+
+#define DECLARE_FUNCTION(TGT)                                             \
+  namespace N_##TGT {                                                     \
+  /* The function argument just demonstrates that args are possible. */   \
+  int64_t FakeFunction(int) { return HWY_##TGT; }                         \
+  }
+
+DECLARE_FUNCTION(AVX3_DL)
+DECLARE_FUNCTION(AVX3)
+DECLARE_FUNCTION(AVX2)
+DECLARE_FUNCTION(SSE4)
+DECLARE_FUNCTION(SSSE3)
+DECLARE_FUNCTION(NEON)
+DECLARE_FUNCTION(SVE)
+DECLARE_FUNCTION(SVE2)
+DECLARE_FUNCTION(SVE_256)
+DECLARE_FUNCTION(SVE2_128)
+DECLARE_FUNCTION(PPC8)
+DECLARE_FUNCTION(WASM)
+DECLARE_FUNCTION(RVV)
+DECLARE_FUNCTION(SCALAR)
+DECLARE_FUNCTION(EMU128)
+
+HWY_EXPORT(FakeFunction);
+
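// [Editorial aside, not part of the upstream patch] For reference, a single
// DECLARE_FUNCTION(AVX2) above expands to
//
//   namespace N_AVX2 {
//   int64_t FakeFunction(int) { return HWY_AVX2; }
//   }
//
// so every target gets an implementation returning its own target bit, and
// HWY_EXPORT(FakeFunction) gathers these per-target namespaces into the
// dispatch table described in targets.h.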
+void CallFunctionForTarget(int64_t target, int line) {
+  if ((HWY_TARGETS & target) == 0) return;
+  hwy::SetSupportedTargetsForTest(target);
+
+  // Call Update() first so that HWY_DYNAMIC_DISPATCH() returns the pointer
+  // to the already-cached function.
+  hwy::GetChosenTarget().Update(hwy::SupportedTargets());
+
+  EXPECT_EQ(target, HWY_DYNAMIC_DISPATCH(FakeFunction)(42)) << line;
+
+  // Calling DeInit() tests that the initializer function also calls the
+  // right function.
+  hwy::GetChosenTarget().DeInit();
+
+#if HWY_DISPATCH_WORKAROUND
+  EXPECT_EQ(HWY_STATIC_TARGET, HWY_DYNAMIC_DISPATCH(FakeFunction)(42)) << line;
+#else
+  EXPECT_EQ(target, HWY_DYNAMIC_DISPATCH(FakeFunction)(42)) << line;
+#endif
+
+  // The second call uses the cached value from the previous call.
+  EXPECT_EQ(target, HWY_DYNAMIC_DISPATCH(FakeFunction)(42)) << line;
+}
+
+void CheckFakeFunction() {
+  // When adding a target, also add it to DECLARE_FUNCTION above.
+  CallFunctionForTarget(HWY_AVX3_DL, __LINE__);
+  CallFunctionForTarget(HWY_AVX3, __LINE__);
+  CallFunctionForTarget(HWY_AVX2, __LINE__);
+  CallFunctionForTarget(HWY_SSE4, __LINE__);
+  CallFunctionForTarget(HWY_SSSE3, __LINE__);
+  CallFunctionForTarget(HWY_NEON, __LINE__);
+  CallFunctionForTarget(HWY_SVE, __LINE__);
+  CallFunctionForTarget(HWY_SVE2, __LINE__);
+  CallFunctionForTarget(HWY_SVE_256, __LINE__);
+  CallFunctionForTarget(HWY_SVE2_128, __LINE__);
+  CallFunctionForTarget(HWY_PPC8, __LINE__);
+  CallFunctionForTarget(HWY_WASM, __LINE__);
+  CallFunctionForTarget(HWY_RVV, __LINE__);
+  // The tables only have space for either HWY_SCALAR or HWY_EMU128; the
+  // former is opt-in only.
+#if defined(HWY_COMPILE_ONLY_SCALAR) || HWY_BROKEN_EMU128
+  CallFunctionForTarget(HWY_SCALAR, __LINE__);
+#else
+  CallFunctionForTarget(HWY_EMU128, __LINE__);
+#endif
+}
+
+}  // namespace fake
+
+namespace hwy {
+
+class HwyTargetsTest : public testing::Test {
+ protected:
+  void TearDown() override {
+    SetSupportedTargetsForTest(0);
+    DisableTargets(0);  // Reset the mask.
+  }
+};
+
+// Test that the order in the HWY_EXPORT static array matches the expected
+// value of the target bits. This is only checked for the targets that are
+// enabled in the current compilation.
+TEST_F(HwyTargetsTest, ChosenTargetOrderTest) { fake::CheckFakeFunction(); }
+
+TEST_F(HwyTargetsTest, DisabledTargetsTest) {
+  DisableTargets(~0LL);
+  // Check that disabling everything at least leaves the static target.
+  HWY_ASSERT(HWY_STATIC_TARGET == SupportedTargets());
+
+  DisableTargets(0);  // Reset the mask.
+  const int64_t current_targets = SupportedTargets();
+  const int64_t enabled_baseline = static_cast<int64_t>(HWY_ENABLED_BASELINE);
+  // Exclude these two because they are always returned by SupportedTargets.
+  const int64_t fallback = HWY_SCALAR | HWY_EMU128;
+  if ((current_targets & ~enabled_baseline & ~fallback) == 0) {
+    // We can't test anything else if the only compiled target is the baseline.
+    return;
+  }
+
+  // Get the lowest bit in the mask (the best target) and disable that one.
+  const int64_t best_target = current_targets & (~current_targets + 1);
+  DisableTargets(best_target);
+
+  // Check that the other targets are still enabled.
+  HWY_ASSERT((best_target ^ current_targets) == SupportedTargets());
+  DisableTargets(0);  // Reset the mask.
+}
+
+}  // namespace hwy
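// [Editorial aside, not part of the upstream patch] DisabledTargetsTest above
// isolates the best (lowest) target bit via the two's-complement identity
// x & (~x + 1) == x & -x. A minimal standalone demonstration, independent of
// Highway:
//
//   #include <cstdint>
//   #include <cstdio>
//
//   int main() {
//     const int64_t targets = 0b101100;  // example mask with three targets
//     const int64_t best = targets & (~targets + 1);  // keeps lowest set bit
//     std::printf("best: %#llx\n", (unsigned long long)best);  // prints 0x4
//     return 0;
//   }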
diff --git a/hwy/tests/arithmetic_test.cc b/hwy/tests/arithmetic_test.cc
new file mode 100644
index 0000000..1fbbd29
--- /dev/null
+++ b/hwy/tests/arithmetic_test.cc
@@ -0,0 +1,445 @@
+// Copyright 2019 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <stddef.h>
+#include <stdint.h>
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "tests/arithmetic_test.cc"
+#include "hwy/foreach_target.h"  // IWYU pragma: keep
+#include "hwy/highway.h"
+#include "hwy/tests/test_util-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+struct TestPlusMinus {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const auto v2 = Iota(d, T(2));
+    const auto v3 = Iota(d, T(3));
+    const auto v4 = Iota(d, T(4));
+
+    const size_t N = Lanes(d);
+    auto lanes = AllocateAligned<T>(N);
+    for (size_t i = 0; i < N; ++i) {
+      lanes[i] = static_cast<T>((2 + i) + (3 + i));
+    }
+    HWY_ASSERT_VEC_EQ(d, lanes.get(), Add(v2, v3));
+    HWY_ASSERT_VEC_EQ(d, Set(d, 2), Sub(v4, v2));
+
+    for (size_t i = 0; i < N; ++i) {
+      lanes[i] = static_cast<T>((2 + i) + (4 + i));
+    }
+    auto sum = v2;
+    sum = Add(sum, v4);  // sum == 6,8..
+    HWY_ASSERT_VEC_EQ(d, Load(d, lanes.get()), sum);
+
+    sum = Sub(sum, v4);
+    HWY_ASSERT_VEC_EQ(d, v2, sum);
+  }
+};
+
+HWY_NOINLINE void TestAllPlusMinus() {
+  ForAllTypes(ForPartialVectors<TestPlusMinus>());
+}
+
+struct TestUnsignedSaturatingArithmetic {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const auto v0 = Zero(d);
+    const auto vi = Iota(d, 1);
+    const auto vm = Set(d, LimitsMax<T>());
+
+    HWY_ASSERT_VEC_EQ(d, Add(v0, v0), SaturatedAdd(v0, v0));
+    HWY_ASSERT_VEC_EQ(d, Add(v0, vi), SaturatedAdd(v0, vi));
+    HWY_ASSERT_VEC_EQ(d, Add(v0, vm), SaturatedAdd(v0, vm));
+    HWY_ASSERT_VEC_EQ(d, vm, SaturatedAdd(vi, vm));
+    HWY_ASSERT_VEC_EQ(d, vm, SaturatedAdd(vm, vm));
+
+    HWY_ASSERT_VEC_EQ(d, v0, SaturatedSub(v0, v0));
+    HWY_ASSERT_VEC_EQ(d, v0, SaturatedSub(v0, vi));
+    HWY_ASSERT_VEC_EQ(d, v0, SaturatedSub(vi, vi));
+    HWY_ASSERT_VEC_EQ(d, v0, SaturatedSub(vi, vm));
+    HWY_ASSERT_VEC_EQ(d, Sub(vm, vi), SaturatedSub(vm, vi));
+  }
+};
+
+struct TestSignedSaturatingArithmetic {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const auto v0 = Zero(d);
+    const auto vpm = Set(d, LimitsMax<T>());
+    // Ensure all lanes are positive, even if Iota wraps around.
+    const auto vi = Or(And(Iota(d, 0), vpm), Set(d, 1));
+    const auto vn = Sub(v0, vi);
+    const auto vnm = Set(d, LimitsMin<T>());
+    HWY_ASSERT_MASK_EQ(d, MaskTrue(d), Gt(vi, v0));
+    HWY_ASSERT_MASK_EQ(d, MaskTrue(d), Lt(vn, v0));
+
+    HWY_ASSERT_VEC_EQ(d, v0, SaturatedAdd(v0, v0));
+    HWY_ASSERT_VEC_EQ(d, vi, SaturatedAdd(v0, vi));
+    HWY_ASSERT_VEC_EQ(d, vpm, SaturatedAdd(v0, vpm));
+    HWY_ASSERT_VEC_EQ(d, vpm, SaturatedAdd(vi, vpm));
+    HWY_ASSERT_VEC_EQ(d, vpm, SaturatedAdd(vpm, vpm));
+
+    HWY_ASSERT_VEC_EQ(d, v0, SaturatedSub(v0, v0));
+    HWY_ASSERT_VEC_EQ(d, Sub(v0, vi), SaturatedSub(v0, vi));
+    HWY_ASSERT_VEC_EQ(d, vn, SaturatedSub(vn, v0));
+    HWY_ASSERT_VEC_EQ(d, vnm, SaturatedSub(vnm, vi));
+    HWY_ASSERT_VEC_EQ(d, vnm, SaturatedSub(vnm, vpm));
+  }
+};
+
+HWY_NOINLINE void TestAllSaturatingArithmetic() {
+  const ForPartialVectors<TestUnsignedSaturatingArithmetic> test_unsigned;
+  test_unsigned(uint8_t());
+  test_unsigned(uint16_t());
+
+  const ForPartialVectors<TestSignedSaturatingArithmetic> test_signed;
+  test_signed(int8_t());
+  test_signed(int16_t());
+}
+
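// [Editorial aside, not part of the upstream patch] Scalar reference for the
// saturating ops exercised above: results clamp to the lane type's range
// instead of wrapping. For uint8_t, SaturatedAdd(200, 100) == 255 and
// SaturatedSub(5, 9) == 0; for int8_t, SaturatedAdd(100, 100) == 127 and
// SaturatedSub(-100, 100) == -128.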
+struct TestAverage {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const auto v0 = Zero(d);
+    const auto v1 = Set(d, T(1));
+    const auto v2 = Set(d, T(2));
+
+    HWY_ASSERT_VEC_EQ(d, v0, AverageRound(v0, v0));
+    HWY_ASSERT_VEC_EQ(d, v1, AverageRound(v0, v1));
+    HWY_ASSERT_VEC_EQ(d, v1, AverageRound(v1, v1));
+    HWY_ASSERT_VEC_EQ(d, v2, AverageRound(v1, v2));
+    HWY_ASSERT_VEC_EQ(d, v2, AverageRound(v2, v2));
+  }
+};
+
+HWY_NOINLINE void TestAllAverage() {
+  const ForPartialVectors<TestAverage> test;
+  test(uint8_t());
+  test(uint16_t());
+}
+
+struct TestAbs {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const auto v0 = Zero(d);
+    const auto vp1 = Set(d, T(1));
+    const auto vn1 = Set(d, T(-1));
+    const auto vpm = Set(d, LimitsMax<T>());
+    const auto vnm = Set(d, LimitsMin<T>());
+
+    HWY_ASSERT_VEC_EQ(d, v0, Abs(v0));
+    HWY_ASSERT_VEC_EQ(d, vp1, Abs(vp1));
+    HWY_ASSERT_VEC_EQ(d, vp1, Abs(vn1));
+    HWY_ASSERT_VEC_EQ(d, vpm, Abs(vpm));
+    HWY_ASSERT_VEC_EQ(d, vnm, Abs(vnm));
+  }
+};
+
+struct TestFloatAbs {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const auto v0 = Zero(d);
+    const auto vp1 = Set(d, T(1));
+    const auto vn1 = Set(d, T(-1));
+    const auto vp2 = Set(d, T(0.01));
+    const auto vn2 = Set(d, T(-0.01));
+
+    HWY_ASSERT_VEC_EQ(d, v0, Abs(v0));
+    HWY_ASSERT_VEC_EQ(d, vp1, Abs(vp1));
+    HWY_ASSERT_VEC_EQ(d, vp1, Abs(vn1));
+    HWY_ASSERT_VEC_EQ(d, vp2, Abs(vp2));
+    HWY_ASSERT_VEC_EQ(d, vp2, Abs(vn2));
+  }
+};
+
+HWY_NOINLINE void TestAllAbs() {
+  ForSignedTypes(ForPartialVectors<TestAbs>());
+  ForFloatTypes(ForPartialVectors<TestFloatAbs>());
+}
+
+struct TestNeg {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const auto v0 = Zero(d);
+    const auto vn = Set(d, T(-3));
+    const auto vp = Set(d, T(3));
+    HWY_ASSERT_VEC_EQ(d, v0, Neg(v0));
+    HWY_ASSERT_VEC_EQ(d, vp, Neg(vn));
+    HWY_ASSERT_VEC_EQ(d, vn, Neg(vp));
+  }
+};
+
+HWY_NOINLINE void TestAllNeg() {
+  ForSignedTypes(ForPartialVectors<TestNeg>());
+  ForFloatTypes(ForPartialVectors<TestNeg>());
+}
+
+struct TestUnsignedMinMax {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const auto v0 = Zero(d);
+    // Leave headroom such that v1 < v2 even after wraparound.
+    const auto mod = And(Iota(d, 0), Set(d, LimitsMax<T>() >> 1));
+    const auto v1 = Add(mod, Set(d, 1));
+    const auto v2 = Add(mod, Set(d, 2));
+    HWY_ASSERT_VEC_EQ(d, v1, Min(v1, v2));
+    HWY_ASSERT_VEC_EQ(d, v2, Max(v1, v2));
+    HWY_ASSERT_VEC_EQ(d, v0, Min(v1, v0));
+    HWY_ASSERT_VEC_EQ(d, v1, Max(v1, v0));
+
+    const auto vmin = Set(d, LimitsMin<T>());
+    const auto vmax = Set(d, LimitsMax<T>());
+
+    HWY_ASSERT_VEC_EQ(d, vmin, Min(vmin, vmax));
+    HWY_ASSERT_VEC_EQ(d, vmin, Min(vmax, vmin));
+
+    HWY_ASSERT_VEC_EQ(d, vmax, Max(vmin, vmax));
+    HWY_ASSERT_VEC_EQ(d, vmax, Max(vmax, vmin));
+  }
+};
+
+struct TestSignedMinMax {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    // Leave headroom such that v1 < v2 even after wraparound.
+    const auto mod = And(Iota(d, 0), Set(d, LimitsMax<T>() >> 1));
+    const auto v1 = Add(mod, Set(d, 1));
+    const auto v2 = Add(mod, Set(d, 2));
+    const auto v_neg = Sub(Zero(d), v1);
+    HWY_ASSERT_VEC_EQ(d, v1, Min(v1, v2));
+    HWY_ASSERT_VEC_EQ(d, v2, Max(v1, v2));
+    HWY_ASSERT_VEC_EQ(d, v_neg, Min(v1, v_neg));
+    HWY_ASSERT_VEC_EQ(d, v1, Max(v1, v_neg));
+
+    const auto v0 = Zero(d);
+    const auto vmin = Set(d, LimitsMin<T>());
+    const auto vmax = Set(d, LimitsMax<T>());
+    HWY_ASSERT_VEC_EQ(d, vmin, Min(v0, vmin));
+    HWY_ASSERT_VEC_EQ(d, vmin, Min(vmin, v0));
+    HWY_ASSERT_VEC_EQ(d, v0, Max(v0, vmin));
+    HWY_ASSERT_VEC_EQ(d, v0, Max(vmin, v0));
+
+    HWY_ASSERT_VEC_EQ(d, vmin, Min(vmin, vmax));
+    HWY_ASSERT_VEC_EQ(d, vmin, Min(vmax, vmin));
+
+    HWY_ASSERT_VEC_EQ(d, vmax, Max(vmin, vmax));
+    HWY_ASSERT_VEC_EQ(d, vmax, Max(vmax, vmin));
+  }
+};
+
+struct TestFloatMinMax {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const auto v1 = Iota(d, 1);
+    const auto v2 = Iota(d, 2);
+    const auto v_neg = Iota(d, -T(Lanes(d)));
+    HWY_ASSERT_VEC_EQ(d, v1, Min(v1, v2));
+    HWY_ASSERT_VEC_EQ(d, v2, Max(v1, v2));
+    HWY_ASSERT_VEC_EQ(d, v_neg, Min(v1, v_neg));
+    HWY_ASSERT_VEC_EQ(d, v1, Max(v1, v_neg));
+
+    const auto v0 = Zero(d);
+    const auto vmin = Set(d, T(-1E30));
+    const auto vmax = Set(d, T(1E30));
+    HWY_ASSERT_VEC_EQ(d, vmin, Min(v0, vmin));
+    HWY_ASSERT_VEC_EQ(d, vmin, Min(vmin, v0));
+    HWY_ASSERT_VEC_EQ(d, v0, Max(v0, vmin));
+    HWY_ASSERT_VEC_EQ(d, v0, Max(vmin, v0));
+
+    HWY_ASSERT_VEC_EQ(d, vmin, Min(vmin, vmax));
+    HWY_ASSERT_VEC_EQ(d, vmin, Min(vmax, vmin));
+
+    HWY_ASSERT_VEC_EQ(d, vmax, Max(vmin, vmax));
+    HWY_ASSERT_VEC_EQ(d, vmax, Max(vmax, vmin));
+  }
+};
+
+HWY_NOINLINE void TestAllMinMax() {
+  ForUnsignedTypes(ForPartialVectors<TestUnsignedMinMax>());
+  ForSignedTypes(ForPartialVectors<TestSignedMinMax>());
+  ForFloatTypes(ForPartialVectors<TestFloatMinMax>());
+}
+
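// [Editorial aside, not part of the upstream patch] Why the "headroom"
// masking above works: for uint8_t, And(Iota(d, 0), Set(d, LimitsMax() >> 1))
// clamps each lane to 0..127, so v1 = mod + 1 and v2 = mod + 2 stay within
// 1..129 < 256. No lane wraps around, hence v1 < v2 holds lane-wise even for
// very long vectors.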
+template <class D>
+static HWY_NOINLINE Vec<D> Make128(D d, uint64_t hi, uint64_t lo) {
+  alignas(16) uint64_t in[2];
+  in[0] = lo;
+  in[1] = hi;
+  return LoadDup128(d, in);
+}
+
+struct TestMinMax128 {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    using V = Vec<D>;
+    const size_t N = Lanes(d);
+    auto a_lanes = AllocateAligned<T>(N);
+    auto b_lanes = AllocateAligned<T>(N);
+    auto min_lanes = AllocateAligned<T>(N);
+    auto max_lanes = AllocateAligned<T>(N);
+    RandomState rng;
+
+    const V v00 = Zero(d);
+    const V v01 = Make128(d, 0, 1);
+    const V v10 = Make128(d, 1, 0);
+    const V v11 = Add(v01, v10);
+
+    // Same arg
+    HWY_ASSERT_VEC_EQ(d, v00, Min128(d, v00, v00));
+    HWY_ASSERT_VEC_EQ(d, v01, Min128(d, v01, v01));
+    HWY_ASSERT_VEC_EQ(d, v10, Min128(d, v10, v10));
+    HWY_ASSERT_VEC_EQ(d, v11, Min128(d, v11, v11));
+    HWY_ASSERT_VEC_EQ(d, v00, Max128(d, v00, v00));
+    HWY_ASSERT_VEC_EQ(d, v01, Max128(d, v01, v01));
+    HWY_ASSERT_VEC_EQ(d, v10, Max128(d, v10, v10));
+    HWY_ASSERT_VEC_EQ(d, v11, Max128(d, v11, v11));
+
+    // First arg less
+    HWY_ASSERT_VEC_EQ(d, v00, Min128(d, v00, v01));
+    HWY_ASSERT_VEC_EQ(d, v01, Min128(d, v01, v10));
+    HWY_ASSERT_VEC_EQ(d, v10, Min128(d, v10, v11));
+    HWY_ASSERT_VEC_EQ(d, v01, Max128(d, v00, v01));
+    HWY_ASSERT_VEC_EQ(d, v10, Max128(d, v01, v10));
+    HWY_ASSERT_VEC_EQ(d, v11, Max128(d, v10, v11));
+
+    // Second arg less
+    HWY_ASSERT_VEC_EQ(d, v00, Min128(d, v01, v00));
+    HWY_ASSERT_VEC_EQ(d, v01, Min128(d, v10, v01));
+    HWY_ASSERT_VEC_EQ(d, v10, Min128(d, v11, v10));
+    HWY_ASSERT_VEC_EQ(d, v01, Max128(d, v01, v00));
+    HWY_ASSERT_VEC_EQ(d, v10, Max128(d, v10, v01));
+    HWY_ASSERT_VEC_EQ(d, v11, Max128(d, v11, v10));
+
+    // Also check 128-bit blocks are independent
+    for (size_t rep = 0; rep < AdjustedReps(1000); ++rep) {
+      for (size_t i = 0; i < N; ++i) {
+        a_lanes[i] = Random64(&rng);
+        b_lanes[i] = Random64(&rng);
+      }
+      const V a = Load(d, a_lanes.get());
+      const V b = Load(d, b_lanes.get());
+      for (size_t i = 0; i < N; i += 2) {
+        const bool lt = a_lanes[i + 1] == b_lanes[i + 1]
+                            ? (a_lanes[i] < b_lanes[i])
+                            : (a_lanes[i + 1] < b_lanes[i + 1]);
+        min_lanes[i + 0] = lt ? a_lanes[i + 0] : b_lanes[i + 0];
+        min_lanes[i + 1] = lt ? a_lanes[i + 1] : b_lanes[i + 1];
+        max_lanes[i + 0] = lt ? b_lanes[i + 0] : a_lanes[i + 0];
+        max_lanes[i + 1] = lt ? b_lanes[i + 1] : a_lanes[i + 1];
+      }
+      HWY_ASSERT_VEC_EQ(d, min_lanes.get(), Min128(d, a, b));
+      HWY_ASSERT_VEC_EQ(d, max_lanes.get(), Max128(d, a, b));
+    }
+  }
+};
+
+HWY_NOINLINE void TestAllMinMax128() {
+  ForGEVectors<128, TestMinMax128>()(uint64_t());
+}
+
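// [Editorial aside, not part of the upstream patch] Min128/Max128 semantics
// as exercised above: each 128-bit block is treated as one unsigned integer
// whose upper 64-bit lane is most significant. Writing blocks as (hi, lo):
//   Min128((1, 0), (0, 1)) == (0, 1)   // hi decides
//   Min128((1, 0), (1, 1)) == (1, 0)   // equal hi, lo decides
// The whole winning block is copied; lanes are never mixed between inputs.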
+struct TestMinMax128Upper {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    using V = Vec<D>;
+    const size_t N = Lanes(d);
+    auto a_lanes = AllocateAligned<T>(N);
+    auto b_lanes = AllocateAligned<T>(N);
+    auto min_lanes = AllocateAligned<T>(N);
+    auto max_lanes = AllocateAligned<T>(N);
+    RandomState rng;
+
+    const V v00 = Zero(d);
+    const V v01 = Make128(d, 0, 1);
+    const V v10 = Make128(d, 1, 0);
+    const V v11 = Add(v01, v10);
+
+    // Same arg
+    HWY_ASSERT_VEC_EQ(d, v00, Min128Upper(d, v00, v00));
+    HWY_ASSERT_VEC_EQ(d, v01, Min128Upper(d, v01, v01));
+    HWY_ASSERT_VEC_EQ(d, v10, Min128Upper(d, v10, v10));
+    HWY_ASSERT_VEC_EQ(d, v11, Min128Upper(d, v11, v11));
+    HWY_ASSERT_VEC_EQ(d, v00, Max128Upper(d, v00, v00));
+    HWY_ASSERT_VEC_EQ(d, v01, Max128Upper(d, v01, v01));
+    HWY_ASSERT_VEC_EQ(d, v10, Max128Upper(d, v10, v10));
+    HWY_ASSERT_VEC_EQ(d, v11, Max128Upper(d, v11, v11));
+
+    // Equivalent but not equal (chooses second arg)
+    HWY_ASSERT_VEC_EQ(d, v01, Min128Upper(d, v00, v01));
+    HWY_ASSERT_VEC_EQ(d, v11, Min128Upper(d, v10, v11));
+    HWY_ASSERT_VEC_EQ(d, v00, Min128Upper(d, v01, v00));
+    HWY_ASSERT_VEC_EQ(d, v10, Min128Upper(d, v11, v10));
+    HWY_ASSERT_VEC_EQ(d, v00, Max128Upper(d, v01, v00));
+    HWY_ASSERT_VEC_EQ(d, v10, Max128Upper(d, v11, v10));
+    HWY_ASSERT_VEC_EQ(d, v01, Max128Upper(d, v00, v01));
+    HWY_ASSERT_VEC_EQ(d, v11, Max128Upper(d, v10, v11));
+
+    // First arg less
+    HWY_ASSERT_VEC_EQ(d, v01, Min128Upper(d, v01, v10));
+    HWY_ASSERT_VEC_EQ(d, v10, Max128Upper(d, v01, v10));
+
+    // Second arg less
+    HWY_ASSERT_VEC_EQ(d, v01, Min128Upper(d, v10, v01));
+    HWY_ASSERT_VEC_EQ(d, v10, Max128Upper(d, v10, v01));
+
+    // Also check 128-bit blocks are independent
+    for (size_t rep = 0; rep < AdjustedReps(1000); ++rep) {
+      for (size_t i = 0; i < N; ++i) {
+        a_lanes[i] = Random64(&rng);
+        b_lanes[i] = Random64(&rng);
+      }
+      const V a = Load(d, a_lanes.get());
+      const V b = Load(d, b_lanes.get());
+      for (size_t i = 0; i < N; i += 2) {
+        const bool lt = a_lanes[i + 1] < b_lanes[i + 1];
+        min_lanes[i + 0] = lt ? a_lanes[i + 0] : b_lanes[i + 0];
+        min_lanes[i + 1] = lt ? a_lanes[i + 1] : b_lanes[i + 1];
+        max_lanes[i + 0] = lt ? b_lanes[i + 0] : a_lanes[i + 0];
+        max_lanes[i + 1] = lt ? b_lanes[i + 1] : a_lanes[i + 1];
+      }
+      HWY_ASSERT_VEC_EQ(d, min_lanes.get(), Min128Upper(d, a, b));
+      HWY_ASSERT_VEC_EQ(d, max_lanes.get(), Max128Upper(d, a, b));
+    }
+  }
+};
+
+HWY_NOINLINE void TestAllMinMax128Upper() {
+  ForGEVectors<128, TestMinMax128Upper>()(uint64_t());
+}
+
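// [Editorial aside, not part of the upstream patch] Min128Upper/Max128Upper
// compare only the upper 64-bit lane of each block. When the upper lanes are
// equal ("equivalent but not equal" above), the second argument wins, which
// the v00/v01 cases verify: Min128Upper(d, v00, v01) returns v01 even though
// both have an upper lane of 0.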
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+
+namespace hwy {
+HWY_BEFORE_TEST(HwyArithmeticTest);
+HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllPlusMinus);
+HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllSaturatingArithmetic);
+HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllAverage);
+HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllAbs);
+HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllNeg);
+HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllMinMax);
+HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllMinMax128);
+HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllMinMax128Upper);
+}  // namespace hwy
+
+#endif
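// [Editorial aside, not part of the upstream patch] On the export block just
// above: as I read the test harness, HWY_EXPORT_AND_TEST_P(Suite, Func) both
// creates the per-target dispatch table for Func (as HWY_EXPORT does) and
// registers a gtest instance parameterized over every compiled target, so
// each TestAll* function runs once per target.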
diff --git a/hwy/tests/blockwise_shift_test.cc b/hwy/tests/blockwise_shift_test.cc
new file mode 100644
index 0000000..d14fb86
--- /dev/null
+++ b/hwy/tests/blockwise_shift_test.cc
@@ -0,0 +1,268 @@
+// Copyright 2019 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>  // memcpy
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "tests/blockwise_shift_test.cc"
+#include "hwy/foreach_target.h"  // IWYU pragma: keep
+#include "hwy/highway.h"
+#include "hwy/tests/test_util-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+struct TestShiftBytes {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    // Scalar does not define Shift*Bytes.
+#if HWY_TARGET != HWY_SCALAR || HWY_IDE
+    const Repartition<uint8_t, D> du8;
+    const size_t N8 = Lanes(du8);
+
+    // Zero remains zero
+    const auto v0 = Zero(d);
+    HWY_ASSERT_VEC_EQ(d, v0, ShiftLeftBytes<1>(v0));
+    HWY_ASSERT_VEC_EQ(d, v0, ShiftLeftBytes<1>(d, v0));
+    HWY_ASSERT_VEC_EQ(d, v0, ShiftRightBytes<1>(d, v0));
+
+    // Zero after shifting out the high/low byte
+    auto bytes = AllocateAligned<uint8_t>(N8);
+    std::fill(bytes.get(), bytes.get() + N8, 0);
+    bytes[N8 - 1] = 0x7F;
+    const auto vhi = BitCast(d, Load(du8, bytes.get()));
+    bytes[N8 - 1] = 0;
+    bytes[0] = 0x7F;
+    const auto vlo = BitCast(d, Load(du8, bytes.get()));
+    HWY_ASSERT_VEC_EQ(d, v0, ShiftLeftBytes<1>(vhi));
+    HWY_ASSERT_VEC_EQ(d, v0, ShiftLeftBytes<1>(d, vhi));
+    HWY_ASSERT_VEC_EQ(d, v0, ShiftRightBytes<1>(d, vlo));
+
+    // Check expected result with Iota
+    const size_t N = Lanes(d);
+    auto in = AllocateAligned<T>(N);
+    const uint8_t* in_bytes = reinterpret_cast<const uint8_t*>(in.get());
+    const auto v = BitCast(d, Iota(du8, 1));
+    Store(v, d, in.get());
+
+    auto expected = AllocateAligned<T>(N);
+    uint8_t* expected_bytes = reinterpret_cast<uint8_t*>(expected.get());
+
+    const size_t block_size = HWY_MIN(N8, 16);
+    for (size_t block = 0; block < N8; block += block_size) {
+      expected_bytes[block] = 0;
+      memcpy(expected_bytes + block + 1, in_bytes + block, block_size - 1);
+    }
+    HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftLeftBytes<1>(v));
+    HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftLeftBytes<1>(d, v));
+
+    for (size_t block = 0; block < N8; block += block_size) {
+      memcpy(expected_bytes + block, in_bytes + block + 1, block_size - 1);
+      expected_bytes[block + block_size - 1] = 0;
+    }
+    HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRightBytes<1>(d, v));
+#else
+    (void)d;
+#endif  // #if HWY_TARGET != HWY_SCALAR
+  }
+};
+
+HWY_NOINLINE void TestAllShiftBytes() {
+  ForIntegerTypes(ForPartialVectors<TestShiftBytes>());
+}
+
+struct TestShiftLeftLanes {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    // Scalar does not define Shift*Lanes.
+#if HWY_TARGET != HWY_SCALAR || HWY_IDE
+    const auto v = Iota(d, T(1));
+    const size_t N = Lanes(d);
+    if (N == 1) return;
+    auto expected = AllocateAligned<T>(N);
+
+    HWY_ASSERT_VEC_EQ(d, v, ShiftLeftLanes<0>(v));
+    HWY_ASSERT_VEC_EQ(d, v, ShiftLeftLanes<0>(d, v));
+
+    constexpr size_t kLanesPerBlock = 16 / sizeof(T);
+
+    for (size_t i = 0; i < N; ++i) {
+      expected[i] = (i % kLanesPerBlock) == 0 ? T(0) : T(i);
+    }
+    HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftLeftLanes<1>(v));
+    HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftLeftLanes<1>(d, v));
+#else
+    (void)d;
+#endif  // #if HWY_TARGET != HWY_SCALAR
+  }
+};
+
+struct TestShiftRightLanes {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    // Scalar does not define Shift*Lanes.
+#if HWY_TARGET != HWY_SCALAR || HWY_IDE
+    const auto v = Iota(d, T(1));
+    const size_t N = Lanes(d);
+    if (N == 1) return;
+    auto expected = AllocateAligned<T>(N);
+
+    HWY_ASSERT_VEC_EQ(d, v, ShiftRightLanes<0>(d, v));
+
+    constexpr size_t kLanesPerBlock = 16 / sizeof(T);
+
+    for (size_t i = 0; i < N; ++i) {
+      const size_t mod = i % kLanesPerBlock;
+      expected[i] = mod == (kLanesPerBlock - 1) || i >= N - 1 ? T(0) : T(2 + i);
+    }
+    HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRightLanes<1>(d, v));
+#else
+    (void)d;
+#endif  // #if HWY_TARGET != HWY_SCALAR
+  }
+};
+
+HWY_NOINLINE void TestAllShiftLeftLanes() {
+  ForAllTypes(ForPartialVectors<TestShiftLeftLanes>());
+}
+
+HWY_NOINLINE void TestAllShiftRightLanes() {
+  ForAllTypes(ForPartialVectors<TestShiftRightLanes>());
+}
+
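// [Editorial aside, not part of the upstream patch] Worked example of the
// per-block behavior verified above: with 8-bit lanes and a 32-byte vector
// (two 16-byte blocks), ShiftLeftBytes<1> shifts each block independently:
//   in:  [b0 b1 .. b15 | b16 b17 .. b31]
//   out: [ 0 b0 .. b14 |  0  b16 .. b30]
// i.e. byte 15 does not spill into byte 16; the block boundary is a wall,
// exactly as the memcpy loops in TestShiftBytes model it.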
+// Scalar does not define CombineShiftRightBytes.
+#if HWY_TARGET != HWY_SCALAR || HWY_IDE
+
+template <int kBytes>
+struct TestCombineShiftRightBytes {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T, D d) {
+    constexpr size_t kBlockSize = 16;
+    static_assert(kBytes < kBlockSize, "Shift count is per block");
+    const Repartition<uint8_t, D> d8;
+    const size_t N8 = Lanes(d8);
+    if (N8 < 16) return;
+    auto hi_bytes = AllocateAligned<uint8_t>(N8);
+    auto lo_bytes = AllocateAligned<uint8_t>(N8);
+    auto expected_bytes = AllocateAligned<uint8_t>(N8);
+    uint8_t combined[2 * kBlockSize];
+
+    // Random inputs in each lane
+    RandomState rng;
+    for (size_t rep = 0; rep < AdjustedReps(100); ++rep) {
+      for (size_t i = 0; i < N8; ++i) {
+        hi_bytes[i] = static_cast<uint8_t>(Random64(&rng) & 0xFF);
+        lo_bytes[i] = static_cast<uint8_t>(Random64(&rng) & 0xFF);
+      }
+      for (size_t i = 0; i < N8; i += kBlockSize) {
+        // Arguments are not the same size.
+        CopyBytes<kBlockSize>(&lo_bytes[i], combined);
+        CopyBytes<kBlockSize>(&hi_bytes[i], combined + kBlockSize);
+        CopyBytes<kBlockSize>(combined + kBytes, &expected_bytes[i]);
+      }
+
+      const auto hi = BitCast(d, Load(d8, hi_bytes.get()));
+      const auto lo = BitCast(d, Load(d8, lo_bytes.get()));
+      const auto expected = BitCast(d, Load(d8, expected_bytes.get()));
+      HWY_ASSERT_VEC_EQ(d, expected, CombineShiftRightBytes<kBytes>(d, hi, lo));
+    }
+  }
+};
+
+template <int kLanes>
+struct TestCombineShiftRightLanes {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T, D d) {
+    const Repartition<uint8_t, D> d8;
+    const size_t N8 = Lanes(d8);
+    if (N8 < 16) return;
+
+    auto hi_bytes = AllocateAligned<uint8_t>(N8);
+    auto lo_bytes = AllocateAligned<uint8_t>(N8);
+    auto expected_bytes = AllocateAligned<uint8_t>(N8);
+    constexpr size_t kBlockSize = 16;
+    uint8_t combined[2 * kBlockSize];
+
+    // Random inputs in each lane
+    RandomState rng;
+    for (size_t rep = 0; rep < AdjustedReps(100); ++rep) {
+      for (size_t i = 0; i < N8; ++i) {
+        hi_bytes[i] = static_cast<uint8_t>(Random64(&rng) & 0xFF);
+        lo_bytes[i] = static_cast<uint8_t>(Random64(&rng) & 0xFF);
+      }
+      for (size_t i = 0; i < N8; i += kBlockSize) {
+        // Arguments are not the same size.
+        CopyBytes<kBlockSize>(&lo_bytes[i], combined);
+        CopyBytes<kBlockSize>(&hi_bytes[i], combined + kBlockSize);
+        CopyBytes<kBlockSize>(combined + kLanes * sizeof(T),
+                              &expected_bytes[i]);
+      }
+
+      const auto hi = BitCast(d, Load(d8, hi_bytes.get()));
+      const auto lo = BitCast(d, Load(d8, lo_bytes.get()));
+      const auto expected = BitCast(d, Load(d8, expected_bytes.get()));
+      HWY_ASSERT_VEC_EQ(d, expected, CombineShiftRightLanes<kLanes>(d, hi, lo));
+    }
+  }
+};
+
+#endif  // #if HWY_TARGET != HWY_SCALAR
+
+struct TestCombineShiftRight {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T t, D d) {
+// Scalar does not define CombineShiftRightBytes.
+#if HWY_TARGET != HWY_SCALAR || HWY_IDE
+    constexpr int kMaxBytes =
+        HWY_MIN(16, static_cast<int>(MaxLanes(d) * sizeof(T)));
+    constexpr int kMaxLanes = kMaxBytes / static_cast<int>(sizeof(T));
+    TestCombineShiftRightBytes<kMaxBytes - 1>()(t, d);
+    TestCombineShiftRightBytes<kMaxBytes / 2>()(t, d);
+    TestCombineShiftRightBytes<1>()(t, d);
+
+    TestCombineShiftRightLanes<kMaxLanes - 1>()(t, d);
+    TestCombineShiftRightLanes<kMaxLanes / 2>()(t, d);
+    TestCombineShiftRightLanes<1>()(t, d);
+#else
+    (void)t;
+    (void)d;
+#endif
+  }
+};
+
+HWY_NOINLINE void TestAllCombineShiftRight() {
+  // Need at least 2 lanes.
+ ForAllTypes(ForShrinkableVectors()); +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE + +namespace hwy { +HWY_BEFORE_TEST(HwyBlockwiseShiftTest); +HWY_EXPORT_AND_TEST_P(HwyBlockwiseShiftTest, TestAllShiftBytes); +HWY_EXPORT_AND_TEST_P(HwyBlockwiseShiftTest, TestAllShiftLeftLanes); +HWY_EXPORT_AND_TEST_P(HwyBlockwiseShiftTest, TestAllShiftRightLanes); +HWY_EXPORT_AND_TEST_P(HwyBlockwiseShiftTest, TestAllCombineShiftRight); +} // namespace hwy + +#endif diff --git a/hwy/tests/blockwise_test.cc b/hwy/tests/blockwise_test.cc new file mode 100644 index 0000000..41097ee --- /dev/null +++ b/hwy/tests/blockwise_test.cc @@ -0,0 +1,452 @@ +// Copyright 2019 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "tests/blockwise_test.cc" +#include "hwy/foreach_target.h" // IWYU pragma: keep +#include "hwy/highway.h" +#include "hwy/tests/test_util-inl.h" + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { + +template +struct TestBroadcastR { + HWY_NOINLINE void operator()() const { + using T = typename D::T; + const D d; + const size_t N = Lanes(d); + if (kLane >= N) return; + auto in_lanes = AllocateAligned(N); + std::fill(in_lanes.get(), in_lanes.get() + N, T(0)); + const size_t blockN = HWY_MIN(N * sizeof(T), 16) / sizeof(T); + // Need to set within each 128-bit block + for (size_t block = 0; block < N; block += blockN) { + in_lanes[block + kLane] = static_cast(block + 1); + } + const auto in = Load(d, in_lanes.get()); + auto expected = AllocateAligned(N); + for (size_t block = 0; block < N; block += blockN) { + for (size_t i = 0; i < blockN; ++i) { + expected[block + i] = T(block + 1); + } + } + HWY_ASSERT_VEC_EQ(d, expected.get(), Broadcast(in)); + + TestBroadcastR()(); + } +}; + +template +struct TestBroadcastR { + void operator()() const {} +}; + +struct TestBroadcast { + template + HWY_NOINLINE void operator()(T /*unused*/, D d) { + TestBroadcastR()(); + } +}; + +HWY_NOINLINE void TestAllBroadcast() { + const ForPartialVectors test; + // No u/i8. 
+ test(uint16_t()); + test(int16_t()); + ForUIF3264(test); +} + +template +struct ChooseTableSize { + template + using type = DIdx; +}; +template <> +struct ChooseTableSize { + template + using type = ScalableTag; +}; + +template +struct TestTableLookupBytes { + template + HWY_NOINLINE void operator()(T /*unused*/, D d) { +#if HWY_TARGET != HWY_SCALAR + RandomState rng; + + const typename ChooseTableSize::template type d_tbl; + const Repartition d_tbl8; + const size_t NT8 = Lanes(d_tbl8); + + const Repartition d8; + const size_t N8 = Lanes(d8); + + // Random input bytes + auto in_bytes = AllocateAligned(NT8); + for (size_t i = 0; i < NT8; ++i) { + in_bytes[i] = Random32(&rng) & 0xFF; + } + const auto in = BitCast(d_tbl, Load(d_tbl8, in_bytes.get())); + + // Enough test data; for larger vectors, upper lanes will be zero. + const uint8_t index_bytes_source[64] = { + // Same index as source, multiple outputs from same input, + // unused input (9), ascending/descending and nonconsecutive neighbors. + 0, 2, 1, 2, 15, 12, 13, 14, 6, 7, 8, 5, 4, 3, 10, 11, + 11, 10, 3, 4, 5, 8, 7, 6, 14, 13, 12, 15, 2, 1, 2, 0, + 4, 3, 2, 2, 5, 6, 7, 7, 15, 15, 15, 15, 15, 15, 0, 1}; + auto index_bytes = AllocateAligned(N8); + const size_t max_index = HWY_MIN(NT8, 16) - 1; + for (size_t i = 0; i < N8; ++i) { + index_bytes[i] = (i < 64) ? index_bytes_source[i] : 0; + // Avoid asan error for partial vectors. + index_bytes[i] = static_cast(HWY_MIN(index_bytes[i], max_index)); + } + const auto indices = Load(d, reinterpret_cast(index_bytes.get())); + + const size_t N = Lanes(d); + auto expected = AllocateAligned(N); + uint8_t* expected_bytes = reinterpret_cast(expected.get()); + + for (size_t block = 0; block < N8; block += 16) { + for (size_t i = 0; i < 16 && (block + i) < N8; ++i) { + const uint8_t index = index_bytes[block + i]; + HWY_ASSERT(index <= max_index); + // Note that block + index may exceed NT8 on RVV, which is fine because + // the operation uses the larger of the table and index vector size. + HWY_ASSERT(block + index < HWY_MAX(N8, NT8)); + // For large vectors, the lane index may wrap around due to block, + // also wrap around after 8-bit overflow. + expected_bytes[block + i] = + in_bytes[(block + index) % HWY_MIN(NT8, 256)]; + } + } + HWY_ASSERT_VEC_EQ(d, expected.get(), TableLookupBytes(in, indices)); + + // Individually test zeroing each byte position. + for (size_t i = 0; i < N8; ++i) { + const uint8_t prev_expected = expected_bytes[i]; + const uint8_t prev_index = index_bytes[i]; + expected_bytes[i] = 0; + + const int idx = 0x80 + (static_cast(Random32(&rng) & 7) << 4); + HWY_ASSERT(0x80 <= idx && idx < 256); + index_bytes[i] = static_cast(idx); + + const auto indices = + Load(d, reinterpret_cast(index_bytes.get())); + HWY_ASSERT_VEC_EQ(d, expected.get(), TableLookupBytesOr0(in, indices)); + expected_bytes[i] = prev_expected; + index_bytes[i] = prev_index; + } +#else + (void)d; +#endif + } +}; + +HWY_NOINLINE void TestAllTableLookupBytesSame() { + // Partial index, same-sized table. + ForIntegerTypes(ForPartialVectors>()); +} + +HWY_NOINLINE void TestAllTableLookupBytesMixed() { + // Partial index, full-size table. 
+ ForIntegerTypes(ForPartialVectors>()); +} + +struct TestInterleaveLower { + template + HWY_NOINLINE void operator()(T /*unused*/, D d) { + using TU = MakeUnsigned; + const size_t N = Lanes(d); + auto even_lanes = AllocateAligned(N); + auto odd_lanes = AllocateAligned(N); + auto expected = AllocateAligned(N); + for (size_t i = 0; i < N; ++i) { + even_lanes[i] = static_cast(2 * i + 0); + odd_lanes[i] = static_cast(2 * i + 1); + } + const auto even = Load(d, even_lanes.get()); + const auto odd = Load(d, odd_lanes.get()); + + const size_t blockN = HWY_MIN(16 / sizeof(T), N); + for (size_t i = 0; i < Lanes(d); ++i) { + const size_t block = i / blockN; + const size_t index = (i % blockN) + block * 2 * blockN; + expected[i] = static_cast(index & LimitsMax()); + } + HWY_ASSERT_VEC_EQ(d, expected.get(), InterleaveLower(even, odd)); + HWY_ASSERT_VEC_EQ(d, expected.get(), InterleaveLower(d, even, odd)); + } +}; + +struct TestInterleaveUpper { + template + HWY_NOINLINE void operator()(T /*unused*/, D d) { + const size_t N = Lanes(d); + if (N == 1) return; + auto even_lanes = AllocateAligned(N); + auto odd_lanes = AllocateAligned(N); + auto expected = AllocateAligned(N); + for (size_t i = 0; i < N; ++i) { + even_lanes[i] = static_cast(2 * i + 0); + odd_lanes[i] = static_cast(2 * i + 1); + } + const auto even = Load(d, even_lanes.get()); + const auto odd = Load(d, odd_lanes.get()); + + const size_t blockN = HWY_MIN(16 / sizeof(T), N); + for (size_t i = 0; i < Lanes(d); ++i) { + const size_t block = i / blockN; + expected[i] = T((i % blockN) + block * 2 * blockN + blockN); + } + HWY_ASSERT_VEC_EQ(d, expected.get(), InterleaveUpper(d, even, odd)); + } +}; + +HWY_NOINLINE void TestAllInterleave() { + // Not DemoteVectors because this cannot be supported by HWY_SCALAR. 
+ ForAllTypes(ForShrinkableVectors()); + ForAllTypes(ForShrinkableVectors()); +} + +struct TestZipLower { + template + HWY_NOINLINE void operator()(T /*unused*/, D d) { + using WideT = MakeWide; + static_assert(sizeof(T) * 2 == sizeof(WideT), "Must be double-width"); + static_assert(IsSigned() == IsSigned(), "Must have same sign"); + const size_t N = Lanes(d); + auto even_lanes = AllocateAligned(N); + auto odd_lanes = AllocateAligned(N); + // At least 2 lanes for HWY_SCALAR + auto zip_lanes = AllocateAligned(HWY_MAX(N, 2)); + const T kMaxT = LimitsMax(); + for (size_t i = 0; i < N; ++i) { + even_lanes[i] = static_cast((2 * i + 0) & kMaxT); + odd_lanes[i] = static_cast((2 * i + 1) & kMaxT); + } + const auto even = Load(d, even_lanes.get()); + const auto odd = Load(d, odd_lanes.get()); + + const Repartition dw; +#if HWY_TARGET == HWY_SCALAR + // Safely handle big-endian + const auto expected = Set(dw, static_cast(1ULL << (sizeof(T) * 8))); +#else + const size_t blockN = HWY_MIN(size_t(16) / sizeof(T), N); + for (size_t i = 0; i < N; i += 2) { + const size_t base = (i / blockN) * blockN; + const size_t mod = i % blockN; + zip_lanes[i + 0] = even_lanes[mod / 2 + base]; + zip_lanes[i + 1] = odd_lanes[mod / 2 + base]; + } + const auto expected = + Load(dw, reinterpret_cast(zip_lanes.get())); +#endif // HWY_TARGET == HWY_SCALAR + HWY_ASSERT_VEC_EQ(dw, expected, ZipLower(even, odd)); + HWY_ASSERT_VEC_EQ(dw, expected, ZipLower(dw, even, odd)); + } +}; + +HWY_NOINLINE void TestAllZipLower() { + const ForDemoteVectors lower_unsigned; + lower_unsigned(uint8_t()); + lower_unsigned(uint16_t()); +#if HWY_HAVE_INTEGER64 + lower_unsigned(uint32_t()); // generates u64 +#endif + + const ForDemoteVectors lower_signed; + lower_signed(int8_t()); + lower_signed(int16_t()); +#if HWY_HAVE_INTEGER64 + lower_signed(int32_t()); // generates i64 +#endif + + // No float - concatenating f32 does not result in a f64 +} + +// Remove this test (so it does not show as having run) if the only target is +// HWY_SCALAR, which does not support this op. 
+#if HWY_TARGETS != HWY_SCALAR + +struct TestZipUpper { + template + HWY_NOINLINE void operator()(T /*unused*/, D d) { +#if HWY_TARGET == HWY_SCALAR + (void)d; +#else + using WideT = MakeWide; + static_assert(sizeof(T) * 2 == sizeof(WideT), "Must be double-width"); + static_assert(IsSigned() == IsSigned(), "Must have same sign"); + const size_t N = Lanes(d); + if (N < 16 / sizeof(T)) return; + auto even_lanes = AllocateAligned(N); + auto odd_lanes = AllocateAligned(N); + auto zip_lanes = AllocateAligned(N); + const T kMaxT = LimitsMax(); + for (size_t i = 0; i < N; ++i) { + even_lanes[i] = static_cast((2 * i + 0) & kMaxT); + odd_lanes[i] = static_cast((2 * i + 1) & kMaxT); + } + const auto even = Load(d, even_lanes.get()); + const auto odd = Load(d, odd_lanes.get()); + + const size_t blockN = HWY_MIN(size_t(16) / sizeof(T), N); + + for (size_t i = 0; i < N; i += 2) { + const size_t base = (i / blockN) * blockN + blockN / 2; + const size_t mod = i % blockN; + zip_lanes[i + 0] = even_lanes[mod / 2 + base]; + zip_lanes[i + 1] = odd_lanes[mod / 2 + base]; + } + const Repartition dw; + const auto expected = + Load(dw, reinterpret_cast(zip_lanes.get())); + HWY_ASSERT_VEC_EQ(dw, expected, ZipUpper(dw, even, odd)); +#endif // HWY_TARGET == HWY_SCALAR + } +}; + +HWY_NOINLINE void TestAllZipUpper() { + const ForShrinkableVectors upper_unsigned; + upper_unsigned(uint8_t()); + upper_unsigned(uint16_t()); +#if HWY_HAVE_INTEGER64 + upper_unsigned(uint32_t()); // generates u64 +#endif + + const ForShrinkableVectors upper_signed; + upper_signed(int8_t()); + upper_signed(int16_t()); +#if HWY_HAVE_INTEGER64 + upper_signed(int32_t()); // generates i64 +#endif + + // No float - concatenating f32 does not result in a f64 +} + +#endif // HWY_TARGETS != HWY_SCALAR + +class TestSpecialShuffle32 { + public: + template + HWY_NOINLINE void operator()(T /*unused*/, D d) { + const auto v = Iota(d, 0); + VerifyLanes32(d, Shuffle2301(v), 2, 3, 0, 1, __FILE__, __LINE__); + VerifyLanes32(d, Shuffle1032(v), 1, 0, 3, 2, __FILE__, __LINE__); + VerifyLanes32(d, Shuffle0321(v), 0, 3, 2, 1, __FILE__, __LINE__); + VerifyLanes32(d, Shuffle2103(v), 2, 1, 0, 3, __FILE__, __LINE__); + VerifyLanes32(d, Shuffle0123(v), 0, 1, 2, 3, __FILE__, __LINE__); + } + + private: + // HWY_INLINE works around a Clang SVE compiler bug where all but the first + // 128 bits (the NEON register) of actual are zero. + template + HWY_INLINE void VerifyLanes32(D d, VecArg actual, const size_t i3, + const size_t i2, const size_t i1, + const size_t i0, const char* filename, + const int line) { + using T = TFromD; + constexpr size_t kBlockN = 16 / sizeof(T); + const size_t N = Lanes(d); + if (N < 4) return; + auto expected = AllocateAligned(N); + for (size_t block = 0; block < N; block += kBlockN) { + expected[block + 3] = static_cast(block + i3); + expected[block + 2] = static_cast(block + i2); + expected[block + 1] = static_cast(block + i1); + expected[block + 0] = static_cast(block + i0); + } + AssertVecEqual(d, expected.get(), actual, filename, line); + } +}; + +class TestSpecialShuffle64 { + public: + template + HWY_NOINLINE void operator()(T /*unused*/, D d) { + const auto v = Iota(d, 0); + VerifyLanes64(d, Shuffle01(v), 0, 1, __FILE__, __LINE__); + } + + private: + // HWY_INLINE works around a Clang SVE compiler bug where all but the first + // 128 bits (the NEON register) of actual are zero. 
+  template <class D, class V>
+  HWY_INLINE void VerifyLanes64(D d, VecArg<V> actual, const size_t i1,
+                                const size_t i0, const char* filename,
+                                const int line) {
+    using T = TFromD<D>;
+    constexpr size_t kBlockN = 16 / sizeof(T);
+    const size_t N = Lanes(d);
+    if (N < 2) return;
+    auto expected = AllocateAligned<T>(N);
+    for (size_t block = 0; block < N; block += kBlockN) {
+      expected[block + 1] = static_cast<T>(block + i1);
+      expected[block + 0] = static_cast<T>(block + i0);
+    }
+    AssertVecEqual(d, expected.get(), actual, filename, line);
+  }
+};
+
+HWY_NOINLINE void TestAllSpecialShuffles() {
+  const ForGEVectors<128, TestSpecialShuffle32> test32;
+  test32(uint32_t());
+  test32(int32_t());
+  test32(float());
+
+#if HWY_HAVE_INTEGER64
+  const ForGEVectors<128, TestSpecialShuffle64> test64;
+  test64(uint64_t());
+  test64(int64_t());
+#endif
+
+#if HWY_HAVE_FLOAT64
+  const ForGEVectors<128, TestSpecialShuffle64> test_d;
+  test_d(double());
+#endif
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+
+namespace hwy {
+HWY_BEFORE_TEST(HwyBlockwiseTest);
+HWY_EXPORT_AND_TEST_P(HwyBlockwiseTest, TestAllBroadcast);
+HWY_EXPORT_AND_TEST_P(HwyBlockwiseTest, TestAllTableLookupBytesSame);
+HWY_EXPORT_AND_TEST_P(HwyBlockwiseTest, TestAllTableLookupBytesMixed);
+HWY_EXPORT_AND_TEST_P(HwyBlockwiseTest, TestAllInterleave);
+HWY_EXPORT_AND_TEST_P(HwyBlockwiseTest, TestAllZipLower);
+#if HWY_TARGETS != HWY_SCALAR
+HWY_EXPORT_AND_TEST_P(HwyBlockwiseTest, TestAllZipUpper);
+#endif
+HWY_EXPORT_AND_TEST_P(HwyBlockwiseTest, TestAllSpecialShuffles);
+}  // namespace hwy
+
+#endif
diff --git a/hwy/tests/combine_test.cc b/hwy/tests/combine_test.cc
new file mode 100644
index 0000000..b99f07a
--- /dev/null
+++ b/hwy/tests/combine_test.cc
@@ -0,0 +1,273 @@
+// Copyright 2019 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+ +#include +#include +#include // memcpy + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "tests/combine_test.cc" +#include "hwy/foreach_target.h" // IWYU pragma: keep +#include "hwy/highway.h" +#include "hwy/tests/test_util-inl.h" + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { + +struct TestLowerHalf { + template + HWY_NOINLINE void operator()(T /*unused*/, D d) { + const Half d2; + + const size_t N = Lanes(d); + auto lanes = AllocateAligned(N); + auto lanes2 = AllocateAligned(N); + std::fill(lanes.get(), lanes.get() + N, T(0)); + std::fill(lanes2.get(), lanes2.get() + N, T(0)); + const auto v = Iota(d, 1); + Store(LowerHalf(d2, v), d2, lanes.get()); + Store(LowerHalf(v), d2, lanes2.get()); // optionally without D + size_t i = 0; + for (; i < Lanes(d2); ++i) { + HWY_ASSERT_EQ(T(1 + i), lanes[i]); + HWY_ASSERT_EQ(T(1 + i), lanes2[i]); + } + // Other half remains unchanged + for (; i < N; ++i) { + HWY_ASSERT_EQ(T(0), lanes[i]); + HWY_ASSERT_EQ(T(0), lanes2[i]); + } + } +}; + +struct TestLowerQuarter { + template + HWY_NOINLINE void operator()(T /*unused*/, D d) { + const Half d2; + const Half d4; + + const size_t N = Lanes(d); + auto lanes = AllocateAligned(N); + auto lanes2 = AllocateAligned(N); + std::fill(lanes.get(), lanes.get() + N, T(0)); + std::fill(lanes2.get(), lanes2.get() + N, T(0)); + const auto v = Iota(d, 1); + const auto lo = LowerHalf(d4, LowerHalf(d2, v)); + const auto lo2 = LowerHalf(LowerHalf(v)); // optionally without D + Store(lo, d4, lanes.get()); + Store(lo2, d4, lanes2.get()); + size_t i = 0; + for (; i < Lanes(d4); ++i) { + HWY_ASSERT_EQ(T(i + 1), lanes[i]); + HWY_ASSERT_EQ(T(i + 1), lanes2[i]); + } + // Upper 3/4 remain unchanged + for (; i < N; ++i) { + HWY_ASSERT_EQ(T(0), lanes[i]); + HWY_ASSERT_EQ(T(0), lanes2[i]); + } + } +}; + +HWY_NOINLINE void TestAllLowerHalf() { + ForAllTypes(ForHalfVectors()); + + // The minimum vector size is 128 bits, so there's no guarantee we can have + // quarters of 64-bit lanes, hence test 'all' other types. + ForHalfVectors test_quarter; + ForUI8(test_quarter); + ForUI16(test_quarter); // exclude float16_t - cannot compare + ForUIF32(test_quarter); +} + +struct TestUpperHalf { + template + HWY_NOINLINE void operator()(T /*unused*/, D d) { + // Scalar does not define UpperHalf. +#if HWY_TARGET != HWY_SCALAR + const Half d2; + const size_t N2 = Lanes(d2); + HWY_ASSERT(N2 * 2 == Lanes(d)); + auto expected = AllocateAligned(N2); + size_t i = 0; + for (; i < N2; ++i) { + expected[i] = static_cast(N2 + 1 + i); + } + HWY_ASSERT_VEC_EQ(d2, expected.get(), UpperHalf(d2, Iota(d, 1))); +#else + (void)d; +#endif + } +}; + +HWY_NOINLINE void TestAllUpperHalf() { + ForAllTypes(ForHalfVectors()); +} + +struct TestZeroExtendVector { + template + HWY_NOINLINE void operator()(T /*unused*/, D d) { + const Twice d2; + + const auto v = Iota(d, 1); + const size_t N = Lanes(d); + const size_t N2 = Lanes(d2); + // If equal, then N was already MaxLanes(d) and it's not clear what + // Combine or ZeroExtendVector should return. 
+ if (N2 == N) return; + HWY_ASSERT(N2 == 2 * N); + auto lanes = AllocateAligned(N2); + Store(v, d, &lanes[0]); + Store(v, d, &lanes[N]); + + const auto ext = ZeroExtendVector(d2, v); + Store(ext, d2, lanes.get()); + + // Lower half is unchanged + HWY_ASSERT_VEC_EQ(d, v, Load(d, &lanes[0])); + // Upper half is zero + HWY_ASSERT_VEC_EQ(d, Zero(d), Load(d, &lanes[N])); + } +}; + +HWY_NOINLINE void TestAllZeroExtendVector() { + ForAllTypes(ForExtendableVectors()); +} + +struct TestCombine { + template + HWY_NOINLINE void operator()(T /*unused*/, D d) { + const Twice d2; + const size_t N2 = Lanes(d2); + auto lanes = AllocateAligned(N2); + + const auto lo = Iota(d, 1); + const auto hi = Iota(d, static_cast(N2 / 2 + 1)); + const auto combined = Combine(d2, hi, lo); + Store(combined, d2, lanes.get()); + + const auto expected = Iota(d2, 1); + HWY_ASSERT_VEC_EQ(d2, expected, combined); + } +}; + +HWY_NOINLINE void TestAllCombine() { + ForAllTypes(ForExtendableVectors()); +} + +struct TestConcat { + template + HWY_NOINLINE void operator()(T /*unused*/, D d) { + const size_t N = Lanes(d); + if (N == 1) return; + const size_t half_bytes = N * sizeof(T) / 2; + + auto hi = AllocateAligned(N); + auto lo = AllocateAligned(N); + auto expected = AllocateAligned(N); + RandomState rng; + for (size_t rep = 0; rep < 10; ++rep) { + for (size_t i = 0; i < N; ++i) { + hi[i] = static_cast(Random64(&rng) & 0xFF); + lo[i] = static_cast(Random64(&rng) & 0xFF); + } + + { + memcpy(&expected[N / 2], &hi[N / 2], half_bytes); + memcpy(&expected[0], &lo[0], half_bytes); + const auto vhi = Load(d, hi.get()); + const auto vlo = Load(d, lo.get()); + HWY_ASSERT_VEC_EQ(d, expected.get(), ConcatUpperLower(d, vhi, vlo)); + } + + { + memcpy(&expected[N / 2], &hi[N / 2], half_bytes); + memcpy(&expected[0], &lo[N / 2], half_bytes); + const auto vhi = Load(d, hi.get()); + const auto vlo = Load(d, lo.get()); + HWY_ASSERT_VEC_EQ(d, expected.get(), ConcatUpperUpper(d, vhi, vlo)); + } + + { + memcpy(&expected[N / 2], &hi[0], half_bytes); + memcpy(&expected[0], &lo[N / 2], half_bytes); + const auto vhi = Load(d, hi.get()); + const auto vlo = Load(d, lo.get()); + HWY_ASSERT_VEC_EQ(d, expected.get(), ConcatLowerUpper(d, vhi, vlo)); + } + + { + memcpy(&expected[N / 2], &hi[0], half_bytes); + memcpy(&expected[0], &lo[0], half_bytes); + const auto vhi = Load(d, hi.get()); + const auto vlo = Load(d, lo.get()); + HWY_ASSERT_VEC_EQ(d, expected.get(), ConcatLowerLower(d, vhi, vlo)); + } + } + } +}; + +HWY_NOINLINE void TestAllConcat() { + ForAllTypes(ForShrinkableVectors()); +} + +struct TestConcatOddEven { + template + HWY_NOINLINE void operator()(T /*unused*/, D d) { +#if HWY_TARGET != HWY_SCALAR + const size_t N = Lanes(d); + const auto hi = Iota(d, static_cast(N)); + const auto lo = Iota(d, 0); + const auto even = Add(Iota(d, 0), Iota(d, 0)); + const auto odd = Add(even, Set(d, 1)); + HWY_ASSERT_VEC_EQ(d, odd, ConcatOdd(d, hi, lo)); + HWY_ASSERT_VEC_EQ(d, even, ConcatEven(d, hi, lo)); + + // This test catches inadvertent saturation. 
+ const auto min = Set(d, LowestValue()); + const auto max = Set(d, HighestValue()); + HWY_ASSERT_VEC_EQ(d, max, ConcatOdd(d, max, max)); + HWY_ASSERT_VEC_EQ(d, max, ConcatEven(d, max, max)); + HWY_ASSERT_VEC_EQ(d, min, ConcatOdd(d, min, min)); + HWY_ASSERT_VEC_EQ(d, min, ConcatEven(d, min, min)); +#else + (void)d; +#endif + } +}; + +HWY_NOINLINE void TestAllConcatOddEven() { + ForAllTypes(ForShrinkableVectors()); +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE + +namespace hwy { +HWY_BEFORE_TEST(HwyCombineTest); +HWY_EXPORT_AND_TEST_P(HwyCombineTest, TestAllLowerHalf); +HWY_EXPORT_AND_TEST_P(HwyCombineTest, TestAllUpperHalf); +HWY_EXPORT_AND_TEST_P(HwyCombineTest, TestAllZeroExtendVector); +HWY_EXPORT_AND_TEST_P(HwyCombineTest, TestAllCombine); +HWY_EXPORT_AND_TEST_P(HwyCombineTest, TestAllConcat); +HWY_EXPORT_AND_TEST_P(HwyCombineTest, TestAllConcatOddEven); +} // namespace hwy + +#endif // HWY_ONCE diff --git a/hwy/tests/compare_test.cc b/hwy/tests/compare_test.cc new file mode 100644 index 0000000..a96e29f --- /dev/null +++ b/hwy/tests/compare_test.cc @@ -0,0 +1,509 @@ +// Copyright 2019 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include // memset + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "tests/compare_test.cc" +#include "hwy/foreach_target.h" // IWYU pragma: keep +#include "hwy/highway.h" +#include "hwy/tests/test_util-inl.h" + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { + +// All types. +struct TestEquality { + template + HWY_NOINLINE void operator()(T /*unused*/, D d) { + const auto v2 = Iota(d, 2); + const auto v2b = Iota(d, 2); + const auto v3 = Iota(d, 3); + + const auto mask_false = MaskFalse(d); + const auto mask_true = MaskTrue(d); + + HWY_ASSERT_MASK_EQ(d, mask_false, Eq(v2, v3)); + HWY_ASSERT_MASK_EQ(d, mask_false, Eq(v3, v2)); + HWY_ASSERT_MASK_EQ(d, mask_true, Eq(v2, v2)); + HWY_ASSERT_MASK_EQ(d, mask_true, Eq(v2, v2b)); + + HWY_ASSERT_MASK_EQ(d, mask_true, Ne(v2, v3)); + HWY_ASSERT_MASK_EQ(d, mask_true, Ne(v3, v2)); + HWY_ASSERT_MASK_EQ(d, mask_false, Ne(v2, v2)); + HWY_ASSERT_MASK_EQ(d, mask_false, Ne(v2, v2b)); + } +}; + +HWY_NOINLINE void TestAllEquality() { + ForAllTypes(ForPartialVectors()); +} + +// a > b should be true, verify that for Gt/Lt and with swapped args. 
+template <class D>
+void EnsureGreater(D d, TFromD<D> a, TFromD<D> b, const char* file, int line) {
+  const auto mask_false = MaskFalse(d);
+  const auto mask_true = MaskTrue(d);
+
+  const auto va = Set(d, a);
+  const auto vb = Set(d, b);
+  AssertMaskEqual(d, mask_true, Gt(va, vb), file, line);
+  AssertMaskEqual(d, mask_false, Lt(va, vb), file, line);
+
+  // Swapped order
+  AssertMaskEqual(d, mask_false, Gt(vb, va), file, line);
+  AssertMaskEqual(d, mask_true, Lt(vb, va), file, line);
+
+  // Also ensure irreflexive
+  AssertMaskEqual(d, mask_false, Gt(va, va), file, line);
+  AssertMaskEqual(d, mask_false, Gt(vb, vb), file, line);
+  AssertMaskEqual(d, mask_false, Lt(va, va), file, line);
+  AssertMaskEqual(d, mask_false, Lt(vb, vb), file, line);
+}
+
+#define HWY_ENSURE_GREATER(d, a, b) EnsureGreater(d, a, b, __FILE__, __LINE__)
+
+struct TestStrictUnsigned {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const T max = LimitsMax<T>();
+    const auto v0 = Zero(d);
+    const auto v2 = And(Iota(d, T(2)), Set(d, 255));  // 0..255
+
+    const auto mask_false = MaskFalse(d);
+
+    // Individual values of interest
+    HWY_ENSURE_GREATER(d, 2, 1);
+    HWY_ENSURE_GREATER(d, 1, 0);
+    HWY_ENSURE_GREATER(d, 128, 127);
+    HWY_ENSURE_GREATER(d, max, max / 2);
+    HWY_ENSURE_GREATER(d, max, 1);
+    HWY_ENSURE_GREATER(d, max, 0);
+
+    // Also use Iota to ensure lanes are independent
+    HWY_ASSERT_MASK_EQ(d, mask_false, Lt(v2, v0));
+    HWY_ASSERT_MASK_EQ(d, mask_false, Gt(v0, v2));
+    HWY_ASSERT_MASK_EQ(d, mask_false, Lt(v0, v0));
+    HWY_ASSERT_MASK_EQ(d, mask_false, Gt(v0, v0));
+    HWY_ASSERT_MASK_EQ(d, mask_false, Lt(v2, v2));
+    HWY_ASSERT_MASK_EQ(d, mask_false, Gt(v2, v2));
+  }
+};
+
+HWY_NOINLINE void TestAllStrictUnsigned() {
+  ForUnsignedTypes(ForPartialVectors<TestStrictUnsigned>());
+}
+
+struct TestStrictInt {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const T min = LimitsMin<T>();
+    const T max = LimitsMax<T>();
+    const auto v0 = Zero(d);
+    const auto v2 = And(Iota(d, T(2)), Set(d, 127));  // 0..127
+    const auto vn = Sub(Neg(v2), Set(d, 1));          // -1..-128
+
+    const auto mask_false = MaskFalse(d);
+    const auto mask_true = MaskTrue(d);
+
+    // Individual values of interest
+    HWY_ENSURE_GREATER(d, 2, 1);
+    HWY_ENSURE_GREATER(d, 1, 0);
+    HWY_ENSURE_GREATER(d, 0, -1);
+    HWY_ENSURE_GREATER(d, -1, -2);
+    HWY_ENSURE_GREATER(d, max, max / 2);
+    HWY_ENSURE_GREATER(d, max, 1);
+    HWY_ENSURE_GREATER(d, max, 0);
+    HWY_ENSURE_GREATER(d, max, -1);
+    HWY_ENSURE_GREATER(d, max, min);
+    HWY_ENSURE_GREATER(d, 0, min);
+    HWY_ENSURE_GREATER(d, min / 2, min);
+
+    // Also use Iota to ensure lanes are independent
+    HWY_ASSERT_MASK_EQ(d, mask_true, Gt(v2, vn));
+    HWY_ASSERT_MASK_EQ(d, mask_true, Lt(vn, v2));
+    HWY_ASSERT_MASK_EQ(d, mask_false, Lt(v2, vn));
+    HWY_ASSERT_MASK_EQ(d, mask_false, Gt(vn, v2));
+
+    HWY_ASSERT_MASK_EQ(d, mask_false, Lt(v0, v0));
+    HWY_ASSERT_MASK_EQ(d, mask_false, Lt(v2, v2));
+    HWY_ASSERT_MASK_EQ(d, mask_false, Lt(vn, vn));
+    HWY_ASSERT_MASK_EQ(d, mask_false, Gt(v0, v0));
+    HWY_ASSERT_MASK_EQ(d, mask_false, Gt(v2, v2));
+    HWY_ASSERT_MASK_EQ(d, mask_false, Gt(vn, vn));
+  }
+};
+
+// S-SSE3 bug (#795): same upper, differing MSB in lower
+struct TestStrictInt64 {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const auto m0 = MaskFalse(d);
+    const auto m1 = MaskTrue(d);
+    HWY_ASSERT_MASK_EQ(d, m0, Lt(Set(d, 0x380000000LL), Set(d, 0x300000001LL)));
+    HWY_ASSERT_MASK_EQ(d, m1, Lt(Set(d, 0xF00000000LL), Set(d, 0xF80000000LL)));
+    HWY_ASSERT_MASK_EQ(d, m1, Lt(Set(d, 0xF00000000LL), Set(d, 0xF80000001LL)));
+  }
+};
+
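// [Editorial aside, not part of the upstream patch] On the regression test
// above: the constants pick pairs whose upper 32 bits match while the lower
// 32 bits differ in their most significant bit (0x80000000). A 64-bit Lt
// emulated from 32-bit signed comparisons can then go wrong, because
// 0x80000000 is negative as int32 but large as uint32; e.g. 0x3'80000000 is
// greater than 0x3'00000001 even though its low half compares as "smaller"
// under a signed 32-bit view.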
+HWY_NOINLINE void TestAllStrictInt() {
+  ForSignedTypes(ForPartialVectors<TestStrictInt>());
+  ForPartialVectors<TestStrictInt64>()(int64_t());
+}
+
+struct TestStrictFloat {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const T huge_neg = T(-1E35);
+    const T huge_pos = T(1E36);
+    const auto v0 = Zero(d);
+    const auto v2 = Iota(d, T(2));
+    const auto vn = Neg(v2);
+
+    const auto mask_false = MaskFalse(d);
+    const auto mask_true = MaskTrue(d);
+
+    // Individual values of interest
+    HWY_ENSURE_GREATER(d, 2, 1);
+    HWY_ENSURE_GREATER(d, 1, 0);
+    HWY_ENSURE_GREATER(d, 0, -1);
+    HWY_ENSURE_GREATER(d, -1, -2);
+    HWY_ENSURE_GREATER(d, huge_pos, 1);
+    HWY_ENSURE_GREATER(d, huge_pos, 0);
+    HWY_ENSURE_GREATER(d, huge_pos, -1);
+    HWY_ENSURE_GREATER(d, huge_pos, huge_neg);
+    HWY_ENSURE_GREATER(d, 0, huge_neg);
+
+    // Also use Iota to ensure lanes are independent
+    HWY_ASSERT_MASK_EQ(d, mask_true, Gt(v2, vn));
+    HWY_ASSERT_MASK_EQ(d, mask_true, Lt(vn, v2));
+    HWY_ASSERT_MASK_EQ(d, mask_false, Lt(v2, vn));
+    HWY_ASSERT_MASK_EQ(d, mask_false, Gt(vn, v2));
+
+    HWY_ASSERT_MASK_EQ(d, mask_false, Lt(v0, v0));
+    HWY_ASSERT_MASK_EQ(d, mask_false, Lt(v2, v2));
+    HWY_ASSERT_MASK_EQ(d, mask_false, Lt(vn, vn));
+    HWY_ASSERT_MASK_EQ(d, mask_false, Gt(v0, v0));
+    HWY_ASSERT_MASK_EQ(d, mask_false, Gt(v2, v2));
+    HWY_ASSERT_MASK_EQ(d, mask_false, Gt(vn, vn));
+  }
+};
+
+HWY_NOINLINE void TestAllStrictFloat() {
+  ForFloatTypes(ForPartialVectors<TestStrictFloat>());
+}
+
+struct TestWeakFloat {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const auto v2 = Iota(d, T(2));
+    const auto vn = Iota(d, -T(Lanes(d)));
+
+    const auto mask_false = MaskFalse(d);
+    const auto mask_true = MaskTrue(d);
+
+    HWY_ASSERT_MASK_EQ(d, mask_true, Ge(v2, v2));
+    HWY_ASSERT_MASK_EQ(d, mask_true, Le(vn, vn));
+
+    HWY_ASSERT_MASK_EQ(d, mask_true, Ge(v2, vn));
+    HWY_ASSERT_MASK_EQ(d, mask_true, Le(vn, v2));
+
+    HWY_ASSERT_MASK_EQ(d, mask_false, Le(v2, vn));
+    HWY_ASSERT_MASK_EQ(d, mask_false, Ge(vn, v2));
+  }
+};
+
+HWY_NOINLINE void TestAllWeakFloat() {
+  ForFloatTypes(ForPartialVectors<TestWeakFloat>());
+}
+
+template <class D>
+static HWY_NOINLINE Vec<D> Make128(D d, uint64_t hi, uint64_t lo) {
+  alignas(16) uint64_t in[2];
+  in[0] = lo;
+  in[1] = hi;
+  return LoadDup128(d, in);
+}
+
+struct TestLt128 {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    using V = Vec<D>;
+    const V v00 = Zero(d);
+    const V v01 = Make128(d, 0, 1);
+    const V v10 = Make128(d, 1, 0);
+    const V v11 = Add(v01, v10);
+
+    const auto mask_false = MaskFalse(d);
+    const auto mask_true = MaskTrue(d);
+
+    HWY_ASSERT_MASK_EQ(d, mask_false, Lt128(d, v00, v00));
+    HWY_ASSERT_MASK_EQ(d, mask_false, Lt128(d, v01, v01));
+    HWY_ASSERT_MASK_EQ(d, mask_false, Lt128(d, v10, v10));
+
+    HWY_ASSERT_MASK_EQ(d, mask_true, Lt128(d, v00, v01));
+    HWY_ASSERT_MASK_EQ(d, mask_true, Lt128(d, v01, v10));
+    HWY_ASSERT_MASK_EQ(d, mask_true, Lt128(d, v01, v11));
+
+    // Reversed order
+    HWY_ASSERT_MASK_EQ(d, mask_false, Lt128(d, v01, v00));
+    HWY_ASSERT_MASK_EQ(d, mask_false, Lt128(d, v10, v01));
+    HWY_ASSERT_MASK_EQ(d, mask_false, Lt128(d, v11, v01));
+
+    // Also check 128-bit blocks are independent
+    const V iota = Iota(d, 1);
+    HWY_ASSERT_MASK_EQ(d, mask_true, Lt128(d, iota, Add(iota, v01)));
+    HWY_ASSERT_MASK_EQ(d, mask_true, Lt128(d, iota, Add(iota, v10)));
+    HWY_ASSERT_MASK_EQ(d, mask_false, Lt128(d, Add(iota, v01), iota));
+    HWY_ASSERT_MASK_EQ(d, mask_false, Lt128(d, Add(iota, v10), iota));
+
+    // Max value
+    const V vm = Make128(d, LimitsMax<uint64_t>(), LimitsMax<uint64_t>());
+    HWY_ASSERT_MASK_EQ(d, mask_false, Lt128(d, vm, vm));
HWY_ASSERT_MASK_EQ(d, mask_false, Lt128(d, vm, v00)); + HWY_ASSERT_MASK_EQ(d, mask_false, Lt128(d, vm, v01)); + HWY_ASSERT_MASK_EQ(d, mask_false, Lt128(d, vm, v10)); + HWY_ASSERT_MASK_EQ(d, mask_false, Lt128(d, vm, v11)); + HWY_ASSERT_MASK_EQ(d, mask_true, Lt128(d, v00, vm)); + HWY_ASSERT_MASK_EQ(d, mask_true, Lt128(d, v01, vm)); + HWY_ASSERT_MASK_EQ(d, mask_true, Lt128(d, v10, vm)); + HWY_ASSERT_MASK_EQ(d, mask_true, Lt128(d, v11, vm)); + } +}; + +HWY_NOINLINE void TestAllLt128() { ForGEVectors<128, TestLt128>()(uint64_t()); } + +struct TestLt128Upper { + template + HWY_NOINLINE void operator()(T /*unused*/, D d) { + using V = Vec; + const V v00 = Zero(d); + const V v01 = Make128(d, 0, 1); + const V v10 = Make128(d, 1, 0); + const V v11 = Add(v01, v10); + + const auto mask_false = MaskFalse(d); + const auto mask_true = MaskTrue(d); + + HWY_ASSERT_MASK_EQ(d, mask_false, Lt128Upper(d, v00, v00)); + HWY_ASSERT_MASK_EQ(d, mask_false, Lt128Upper(d, v01, v01)); + HWY_ASSERT_MASK_EQ(d, mask_false, Lt128Upper(d, v10, v10)); + + HWY_ASSERT_MASK_EQ(d, mask_false, Lt128Upper(d, v00, v01)); + HWY_ASSERT_MASK_EQ(d, mask_true, Lt128Upper(d, v01, v10)); + HWY_ASSERT_MASK_EQ(d, mask_true, Lt128Upper(d, v01, v11)); + + // Reversed order + HWY_ASSERT_MASK_EQ(d, mask_false, Lt128Upper(d, v01, v00)); + HWY_ASSERT_MASK_EQ(d, mask_false, Lt128Upper(d, v10, v01)); + HWY_ASSERT_MASK_EQ(d, mask_false, Lt128Upper(d, v11, v01)); + + // Also check 128-bit blocks are independent + const V iota = Iota(d, 1); + HWY_ASSERT_MASK_EQ(d, mask_false, Lt128Upper(d, iota, Add(iota, v01))); + HWY_ASSERT_MASK_EQ(d, mask_true, Lt128Upper(d, iota, Add(iota, v10))); + HWY_ASSERT_MASK_EQ(d, mask_false, Lt128Upper(d, Add(iota, v01), iota)); + HWY_ASSERT_MASK_EQ(d, mask_false, Lt128Upper(d, Add(iota, v10), iota)); + + // Max value + const V vm = Make128(d, LimitsMax(), LimitsMax()); + HWY_ASSERT_MASK_EQ(d, mask_false, Lt128Upper(d, vm, vm)); + HWY_ASSERT_MASK_EQ(d, mask_false, Lt128Upper(d, vm, v00)); + HWY_ASSERT_MASK_EQ(d, mask_false, Lt128Upper(d, vm, v01)); + HWY_ASSERT_MASK_EQ(d, mask_false, Lt128Upper(d, vm, v10)); + HWY_ASSERT_MASK_EQ(d, mask_false, Lt128Upper(d, vm, v11)); + HWY_ASSERT_MASK_EQ(d, mask_true, Lt128Upper(d, v00, vm)); + HWY_ASSERT_MASK_EQ(d, mask_true, Lt128Upper(d, v01, vm)); + HWY_ASSERT_MASK_EQ(d, mask_true, Lt128Upper(d, v10, vm)); + HWY_ASSERT_MASK_EQ(d, mask_true, Lt128Upper(d, v11, vm)); + } +}; + +HWY_NOINLINE void TestAllLt128Upper() { + ForGEVectors<128, TestLt128Upper>()(uint64_t()); +} + +struct TestEq128 { // Also Ne128 + template + HWY_NOINLINE void operator()(T /*unused*/, D d) { + using V = Vec; + const V v00 = Zero(d); + const V v01 = Make128(d, 0, 1); + const V v10 = Make128(d, 1, 0); + const V v11 = Add(v01, v10); + + const auto mask_false = MaskFalse(d); + const auto mask_true = MaskTrue(d); + + HWY_ASSERT_MASK_EQ(d, mask_true, Eq128(d, v00, v00)); + HWY_ASSERT_MASK_EQ(d, mask_true, Eq128(d, v01, v01)); + HWY_ASSERT_MASK_EQ(d, mask_true, Eq128(d, v10, v10)); + HWY_ASSERT_MASK_EQ(d, mask_false, Ne128(d, v00, v00)); + HWY_ASSERT_MASK_EQ(d, mask_false, Ne128(d, v01, v01)); + HWY_ASSERT_MASK_EQ(d, mask_false, Ne128(d, v10, v10)); + + HWY_ASSERT_MASK_EQ(d, mask_false, Eq128(d, v00, v01)); + HWY_ASSERT_MASK_EQ(d, mask_false, Eq128(d, v01, v10)); + HWY_ASSERT_MASK_EQ(d, mask_false, Eq128(d, v01, v11)); + HWY_ASSERT_MASK_EQ(d, mask_true, Ne128(d, v00, v01)); + HWY_ASSERT_MASK_EQ(d, mask_true, Ne128(d, v01, v10)); + HWY_ASSERT_MASK_EQ(d, mask_true, Ne128(d, v01, v11)); + + // Reversed order + 
HWY_ASSERT_MASK_EQ(d, mask_false, Eq128(d, v01, v00)); + HWY_ASSERT_MASK_EQ(d, mask_false, Eq128(d, v10, v01)); + HWY_ASSERT_MASK_EQ(d, mask_false, Eq128(d, v11, v01)); + HWY_ASSERT_MASK_EQ(d, mask_true, Ne128(d, v01, v00)); + HWY_ASSERT_MASK_EQ(d, mask_true, Ne128(d, v10, v01)); + HWY_ASSERT_MASK_EQ(d, mask_true, Ne128(d, v11, v01)); + + // Also check 128-bit blocks are independent + const V iota = Iota(d, 1); + HWY_ASSERT_MASK_EQ(d, mask_false, Eq128(d, iota, Add(iota, v01))); + HWY_ASSERT_MASK_EQ(d, mask_false, Eq128(d, iota, Add(iota, v10))); + HWY_ASSERT_MASK_EQ(d, mask_false, Eq128(d, Add(iota, v01), iota)); + HWY_ASSERT_MASK_EQ(d, mask_false, Eq128(d, Add(iota, v10), iota)); + HWY_ASSERT_MASK_EQ(d, mask_true, Ne128(d, iota, Add(iota, v01))); + HWY_ASSERT_MASK_EQ(d, mask_true, Ne128(d, iota, Add(iota, v10))); + HWY_ASSERT_MASK_EQ(d, mask_true, Ne128(d, Add(iota, v01), iota)); + HWY_ASSERT_MASK_EQ(d, mask_true, Ne128(d, Add(iota, v10), iota)); + + // Max value + const V vm = Make128(d, LimitsMax(), LimitsMax()); + HWY_ASSERT_MASK_EQ(d, mask_true, Eq128(d, vm, vm)); + HWY_ASSERT_MASK_EQ(d, mask_false, Ne128(d, vm, vm)); + + HWY_ASSERT_MASK_EQ(d, mask_false, Eq128(d, vm, v00)); + HWY_ASSERT_MASK_EQ(d, mask_false, Eq128(d, vm, v01)); + HWY_ASSERT_MASK_EQ(d, mask_false, Eq128(d, vm, v10)); + HWY_ASSERT_MASK_EQ(d, mask_false, Eq128(d, vm, v11)); + HWY_ASSERT_MASK_EQ(d, mask_false, Eq128(d, v00, vm)); + HWY_ASSERT_MASK_EQ(d, mask_false, Eq128(d, v01, vm)); + HWY_ASSERT_MASK_EQ(d, mask_false, Eq128(d, v10, vm)); + HWY_ASSERT_MASK_EQ(d, mask_false, Eq128(d, v11, vm)); + + HWY_ASSERT_MASK_EQ(d, mask_true, Ne128(d, vm, v00)); + HWY_ASSERT_MASK_EQ(d, mask_true, Ne128(d, vm, v01)); + HWY_ASSERT_MASK_EQ(d, mask_true, Ne128(d, vm, v10)); + HWY_ASSERT_MASK_EQ(d, mask_true, Ne128(d, vm, v11)); + HWY_ASSERT_MASK_EQ(d, mask_true, Ne128(d, v00, vm)); + HWY_ASSERT_MASK_EQ(d, mask_true, Ne128(d, v01, vm)); + HWY_ASSERT_MASK_EQ(d, mask_true, Ne128(d, v10, vm)); + HWY_ASSERT_MASK_EQ(d, mask_true, Ne128(d, v11, vm)); + } +}; + +HWY_NOINLINE void TestAllEq128() { ForGEVectors<128, TestEq128>()(uint64_t()); } + +struct TestEq128Upper { // Also Ne128Upper + template + HWY_NOINLINE void operator()(T /*unused*/, D d) { + using V = Vec; + const V v00 = Zero(d); + const V v01 = Make128(d, 0, 1); + const V v10 = Make128(d, 1, 0); + const V v11 = Add(v01, v10); + + const auto mask_false = MaskFalse(d); + const auto mask_true = MaskTrue(d); + + HWY_ASSERT_MASK_EQ(d, mask_true, Eq128Upper(d, v00, v00)); + HWY_ASSERT_MASK_EQ(d, mask_true, Eq128Upper(d, v01, v01)); + HWY_ASSERT_MASK_EQ(d, mask_true, Eq128Upper(d, v10, v10)); + HWY_ASSERT_MASK_EQ(d, mask_false, Ne128Upper(d, v00, v00)); + HWY_ASSERT_MASK_EQ(d, mask_false, Ne128Upper(d, v01, v01)); + HWY_ASSERT_MASK_EQ(d, mask_false, Ne128Upper(d, v10, v10)); + + HWY_ASSERT_MASK_EQ(d, mask_true, Eq128Upper(d, v00, v01)); + HWY_ASSERT_MASK_EQ(d, mask_false, Ne128Upper(d, v00, v01)); + + HWY_ASSERT_MASK_EQ(d, mask_false, Eq128Upper(d, v01, v10)); + HWY_ASSERT_MASK_EQ(d, mask_false, Eq128Upper(d, v01, v11)); + HWY_ASSERT_MASK_EQ(d, mask_true, Ne128Upper(d, v01, v10)); + HWY_ASSERT_MASK_EQ(d, mask_true, Ne128Upper(d, v01, v11)); + + // Reversed order + HWY_ASSERT_MASK_EQ(d, mask_true, Eq128Upper(d, v01, v00)); + HWY_ASSERT_MASK_EQ(d, mask_false, Ne128Upper(d, v01, v00)); + + HWY_ASSERT_MASK_EQ(d, mask_false, Eq128Upper(d, v10, v01)); + HWY_ASSERT_MASK_EQ(d, mask_false, Eq128Upper(d, v11, v01)); + HWY_ASSERT_MASK_EQ(d, mask_true, Ne128Upper(d, v10, v01)); + 
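+    // Note (our reading of the assertions above): the *Upper variants
+    // consult only the upper u64 lane of each 128-bit block, which is why
+    // v01 (hi=0, lo=1) compared equal to v00 earlier despite differing in
+    // the lower lane.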
HWY_ASSERT_MASK_EQ(d, mask_true, Ne128Upper(d, v11, v01)); + + // Also check 128-bit blocks are independent + const V iota = Iota(d, 1); + HWY_ASSERT_MASK_EQ(d, mask_true, Eq128Upper(d, iota, Add(iota, v01))); + HWY_ASSERT_MASK_EQ(d, mask_false, Ne128Upper(d, iota, Add(iota, v01))); + HWY_ASSERT_MASK_EQ(d, mask_false, Eq128Upper(d, iota, Add(iota, v10))); + HWY_ASSERT_MASK_EQ(d, mask_true, Ne128Upper(d, iota, Add(iota, v10))); + HWY_ASSERT_MASK_EQ(d, mask_true, Eq128Upper(d, Add(iota, v01), iota)); + HWY_ASSERT_MASK_EQ(d, mask_false, Ne128Upper(d, Add(iota, v01), iota)); + HWY_ASSERT_MASK_EQ(d, mask_false, Eq128Upper(d, Add(iota, v10), iota)); + HWY_ASSERT_MASK_EQ(d, mask_true, Ne128Upper(d, Add(iota, v10), iota)); + + // Max value + const V vm = Make128(d, LimitsMax(), LimitsMax()); + HWY_ASSERT_MASK_EQ(d, mask_true, Eq128Upper(d, vm, vm)); + HWY_ASSERT_MASK_EQ(d, mask_false, Ne128Upper(d, vm, vm)); + + HWY_ASSERT_MASK_EQ(d, mask_false, Eq128Upper(d, vm, v00)); + HWY_ASSERT_MASK_EQ(d, mask_false, Eq128Upper(d, vm, v01)); + HWY_ASSERT_MASK_EQ(d, mask_false, Eq128Upper(d, vm, v10)); + HWY_ASSERT_MASK_EQ(d, mask_false, Eq128Upper(d, vm, v11)); + HWY_ASSERT_MASK_EQ(d, mask_false, Eq128Upper(d, v00, vm)); + HWY_ASSERT_MASK_EQ(d, mask_false, Eq128Upper(d, v01, vm)); + HWY_ASSERT_MASK_EQ(d, mask_false, Eq128Upper(d, v10, vm)); + HWY_ASSERT_MASK_EQ(d, mask_false, Eq128Upper(d, v11, vm)); + + HWY_ASSERT_MASK_EQ(d, mask_true, Ne128Upper(d, vm, v00)); + HWY_ASSERT_MASK_EQ(d, mask_true, Ne128Upper(d, vm, v01)); + HWY_ASSERT_MASK_EQ(d, mask_true, Ne128Upper(d, vm, v10)); + HWY_ASSERT_MASK_EQ(d, mask_true, Ne128Upper(d, vm, v11)); + HWY_ASSERT_MASK_EQ(d, mask_true, Ne128Upper(d, v00, vm)); + HWY_ASSERT_MASK_EQ(d, mask_true, Ne128Upper(d, v01, vm)); + HWY_ASSERT_MASK_EQ(d, mask_true, Ne128Upper(d, v10, vm)); + HWY_ASSERT_MASK_EQ(d, mask_true, Ne128Upper(d, v11, vm)); + } +}; + +HWY_NOINLINE void TestAllEq128Upper() { + ForGEVectors<128, TestEq128Upper>()(uint64_t()); +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE + +namespace hwy { +HWY_BEFORE_TEST(HwyCompareTest); +HWY_EXPORT_AND_TEST_P(HwyCompareTest, TestAllEquality); +HWY_EXPORT_AND_TEST_P(HwyCompareTest, TestAllStrictUnsigned); +HWY_EXPORT_AND_TEST_P(HwyCompareTest, TestAllStrictInt); +HWY_EXPORT_AND_TEST_P(HwyCompareTest, TestAllStrictFloat); +HWY_EXPORT_AND_TEST_P(HwyCompareTest, TestAllWeakFloat); +HWY_EXPORT_AND_TEST_P(HwyCompareTest, TestAllLt128); +HWY_EXPORT_AND_TEST_P(HwyCompareTest, TestAllLt128Upper); +HWY_EXPORT_AND_TEST_P(HwyCompareTest, TestAllEq128); +HWY_EXPORT_AND_TEST_P(HwyCompareTest, TestAllEq128Upper); +} // namespace hwy + +#endif diff --git a/hwy/tests/compress_test.cc b/hwy/tests/compress_test.cc new file mode 100644 index 0000000..e2d0ef0 --- /dev/null +++ b/hwy/tests/compress_test.cc @@ -0,0 +1,757 @@ +// Copyright 2022 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include // memset + +#include // IWYU pragma: keep + +#include "hwy/base.h" + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "tests/compress_test.cc" +#include "hwy/foreach_target.h" // IWYU pragma: keep +#include "hwy/highway.h" +#include "hwy/tests/test_util-inl.h" + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { + +// Regenerate tables used in the implementation, instead of testing. +#define HWY_PRINT_TABLES 0 + +#if !HWY_PRINT_TABLES || HWY_IDE + +template , typename TI = TFromD> +void CheckStored(D d, DI di, size_t expected_pos, size_t actual_pos, + size_t num_to_check, const AlignedFreeUniquePtr& in, + const AlignedFreeUniquePtr& mask_lanes, + const AlignedFreeUniquePtr& expected, const T* actual_u, + int line) { + if (expected_pos != actual_pos) { + hwy::Abort(__FILE__, line, "Size mismatch for %s: expected %d, actual %d\n", + TypeName(T(), Lanes(d)).c_str(), static_cast(expected_pos), + static_cast(actual_pos)); + } + // Modified from AssertVecEqual - we may not be checking all lanes. + for (size_t i = 0; i < num_to_check; ++i) { + if (!IsEqual(expected[i], actual_u[i])) { + const size_t N = Lanes(d); + fprintf(stderr, "Mismatch at i=%d of %d, line %d:\n\n", + static_cast(i), static_cast(num_to_check), line); + Print(di, "mask", Load(di, mask_lanes.get()), 0, N); + Print(d, "in", Load(d, in.get()), 0, N); + Print(d, "expect", Load(d, expected.get()), 0, N); + Print(d, "actual", Load(d, actual_u), 0, N); + HWY_ASSERT(false); + } + } +} + +struct TestCompress { + template + HWY_NOINLINE void operator()(T /*unused*/, D d) { + RandomState rng; + + using TI = MakeSigned; // For mask > 0 comparison + const Rebind di; + const size_t N = Lanes(d); + + const T zero{0}; + + for (int frac : {0, 2, 3}) { + // For CompressStore + const size_t misalign = static_cast(frac) * N / 4; + + auto in_lanes = AllocateAligned(N); + auto mask_lanes = AllocateAligned(N); + auto expected = AllocateAligned(N); + auto actual_a = AllocateAligned(misalign + N); + T* actual_u = actual_a.get() + misalign; + + const size_t bits_size = RoundUpTo((N + 7) / 8, 8); + auto bits = AllocateAligned(bits_size); + memset(bits.get(), 0, bits_size); // for MSAN + + // Each lane should have a chance of having mask=true. + for (size_t rep = 0; rep < AdjustedReps(200); ++rep) { + size_t expected_pos = 0; + for (size_t i = 0; i < N; ++i) { + const uint64_t bits = Random32(&rng); + in_lanes[i] = T(); // cannot initialize float16_t directly. + CopyBytes(&bits, &in_lanes[i]); // not same size + mask_lanes[i] = (Random32(&rng) & 1024) ? TI(1) : TI(0); + if (mask_lanes[i] > 0) { + expected[expected_pos++] = in_lanes[i]; + } + } + size_t num_to_check; + if (CompressIsPartition::value) { + // For non-native Compress, also check that mask=false lanes were + // moved to the back of the vector (highest indices). + size_t extra = expected_pos; + for (size_t i = 0; i < N; ++i) { + if (mask_lanes[i] == 0) { + expected[extra++] = in_lanes[i]; + } + } + HWY_ASSERT(extra == N); + num_to_check = N; + } else { + // For native Compress, only the mask=true lanes are defined. 
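+        // (In other words: with CompressIsPartition, the op is a full
+        // permutation, mask=true lanes first and mask=false lanes after,
+        // so all N lanes can be checked. Otherwise the tail is
+        // implementation-defined and only the first expected_pos lanes
+        // are compared, as assigned below.)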
+ num_to_check = expected_pos; + } + + const auto in = Load(d, in_lanes.get()); + const auto mask = + RebindMask(d, Gt(Load(di, mask_lanes.get()), Zero(di))); + StoreMaskBits(d, mask, bits.get()); + + // Compress + memset(actual_u, 0, N * sizeof(T)); + StoreU(Compress(in, mask), d, actual_u); + CheckStored(d, di, expected_pos, expected_pos, num_to_check, in_lanes, + mask_lanes, expected, actual_u, __LINE__); + + // CompressNot + memset(actual_u, 0, N * sizeof(T)); + StoreU(CompressNot(in, Not(mask)), d, actual_u); + CheckStored(d, di, expected_pos, expected_pos, num_to_check, in_lanes, + mask_lanes, expected, actual_u, __LINE__); + + // CompressStore + memset(actual_u, 0, N * sizeof(T)); + const size_t size1 = CompressStore(in, mask, d, actual_u); + // expected_pos instead of num_to_check because this op is not + // affected by CompressIsPartition. + CheckStored(d, di, expected_pos, size1, expected_pos, in_lanes, + mask_lanes, expected, actual_u, __LINE__); + + // CompressBlendedStore + memset(actual_u, 0, N * sizeof(T)); + const size_t size2 = CompressBlendedStore(in, mask, d, actual_u); + // expected_pos instead of num_to_check because this op only writes + // the mask=true lanes. + CheckStored(d, di, expected_pos, size2, expected_pos, in_lanes, + mask_lanes, expected, actual_u, __LINE__); + // Subsequent lanes are untouched. + for (size_t i = size2; i < N; ++i) { + HWY_ASSERT_EQ(zero, actual_u[i]); + } + + // CompressBits + memset(actual_u, 0, N * sizeof(T)); + StoreU(CompressBits(in, bits.get()), d, actual_u); + CheckStored(d, di, expected_pos, expected_pos, num_to_check, in_lanes, + mask_lanes, expected, actual_u, __LINE__); + + // CompressBitsStore + memset(actual_u, 0, N * sizeof(T)); + const size_t size3 = CompressBitsStore(in, bits.get(), d, actual_u); + // expected_pos instead of num_to_check because this op is not + // affected by CompressIsPartition. + CheckStored(d, di, expected_pos, size3, expected_pos, in_lanes, + mask_lanes, expected, actual_u, __LINE__); + } // rep + } // frac + } // operator() +}; + +HWY_NOINLINE void TestAllCompress() { + ForUIF163264(ForPartialVectors()); +} + +struct TestCompressBlocks { + template + HWY_NOINLINE void operator()(T /*unused*/, D d) { +#if HWY_TARGET == HWY_SCALAR + (void)d; +#else + static_assert(sizeof(T) == 8 && !IsSigned(), "Should be u64"); + RandomState rng; + + using TI = MakeSigned; // For mask > 0 comparison + const Rebind di; + const size_t N = Lanes(d); + + auto in_lanes = AllocateAligned(N); + auto mask_lanes = AllocateAligned(N); + auto expected = AllocateAligned(N); + auto actual = AllocateAligned(N); + + // Each lane should have a chance of having mask=true. + for (size_t rep = 0; rep < AdjustedReps(200); ++rep) { + size_t expected_pos = 0; + for (size_t i = 0; i < N; i += 2) { + const uint64_t bits = Random32(&rng); + in_lanes[i + 1] = in_lanes[i] = T(); // cannot set float16_t directly. + CopyBytes(&bits, &in_lanes[i]); // not same size + CopyBytes(&bits, &in_lanes[i + 1]); // not same size + mask_lanes[i + 1] = mask_lanes[i] = TI{(Random32(&rng) & 8) ? 1 : 0}; + if (mask_lanes[i] > 0) { + expected[expected_pos++] = in_lanes[i]; + expected[expected_pos++] = in_lanes[i + 1]; + } + } + size_t num_to_check; + if (CompressIsPartition::value) { + // For non-native Compress, also check that mask=false lanes were + // moved to the back of the vector (highest indices). 
+ size_t extra = expected_pos; + for (size_t i = 0; i < N; ++i) { + if (mask_lanes[i] == 0) { + expected[extra++] = in_lanes[i]; + } + } + HWY_ASSERT(extra == N); + num_to_check = N; + } else { + // For native Compress, only the mask=true lanes are defined. + num_to_check = expected_pos; + } + + const auto in = Load(d, in_lanes.get()); + const auto mask = RebindMask(d, Gt(Load(di, mask_lanes.get()), Zero(di))); + + // CompressBlocksNot + memset(actual.get(), 0, N * sizeof(T)); + StoreU(CompressBlocksNot(in, Not(mask)), d, actual.get()); + CheckStored(d, di, expected_pos, expected_pos, num_to_check, in_lanes, + mask_lanes, expected, actual.get(), __LINE__); + } // rep +#endif // HWY_TARGET == HWY_SCALAR + } // operator() +}; + +HWY_NOINLINE void TestAllCompressBlocks() { + ForGE128Vectors()(uint64_t()); +} + +#endif // !HWY_PRINT_TABLES + +#if HWY_PRINT_TABLES || HWY_IDE +namespace detail { // for code folding + +void PrintCompress16x8Tables() { + printf("======================================= 16x8\n"); + constexpr size_t N = 8; // 128-bit SIMD + for (uint64_t code = 0; code < (1ull << N); ++code) { + std::array indices{0}; + size_t pos = 0; + // All lanes where mask = true + for (size_t i = 0; i < N; ++i) { + if (code & (1ull << i)) { + indices[pos++] = i; + } + } + // All lanes where mask = false + for (size_t i = 0; i < N; ++i) { + if (!(code & (1ull << i))) { + indices[pos++] = i; + } + } + HWY_ASSERT(pos == N); + + // Doubled (for converting lane to byte indices) + for (size_t i = 0; i < N; ++i) { + printf("%d,", 2 * indices[i]); + } + printf(code & 1 ? "//\n" : "/**/"); + } + printf("\n"); +} + +void PrintCompressNot16x8Tables() { + printf("======================================= Not 16x8\n"); + constexpr size_t N = 8; // 128-bit SIMD + for (uint64_t not_code = 0; not_code < (1ull << N); ++not_code) { + const uint64_t code = ~not_code; + std::array indices{0}; + size_t pos = 0; + // All lanes where mask = true + for (size_t i = 0; i < N; ++i) { + if (code & (1ull << i)) { + indices[pos++] = i; + } + } + // All lanes where mask = false + for (size_t i = 0; i < N; ++i) { + if (!(code & (1ull << i))) { + indices[pos++] = i; + } + } + HWY_ASSERT(pos == N); + + // Doubled (for converting lane to byte indices) + for (size_t i = 0; i < N; ++i) { + printf("%d,", 2 * indices[i]); + } + printf(not_code & 1 ? "//\n" : "/**/"); + } + printf("\n"); +} + +// Compressed to nibbles, unpacked via variable right shift. Also includes +// FirstN bits in the nibble MSB. 
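+// (Packing sketch: each of the 8 lane indices fits in a nibble, so one
+// 32-bit constant encodes a whole permutation per 8-bit mask; bit 3 of the
+// first PopCount(mask) nibbles additionally records the FirstN boundary.
+// The generators below emit these as 0x%08x literals.)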
+void PrintCompress32x8Tables() { + printf("======================================= 32/64x8\n"); + constexpr size_t N = 8; // AVX2 or 64-bit AVX3 + for (uint64_t code = 0; code < (1ull << N); ++code) { + const size_t count = PopCount(code); + std::array indices{0}; + size_t pos = 0; + // All lanes where mask = true + for (size_t i = 0; i < N; ++i) { + if (code & (1ull << i)) { + indices[pos++] = i; + } + } + // All lanes where mask = false + for (size_t i = 0; i < N; ++i) { + if (!(code & (1ull << i))) { + indices[pos++] = i; + } + } + HWY_ASSERT(pos == N); + + // Convert to nibbles + uint64_t packed = 0; + for (size_t i = 0; i < N; ++i) { + HWY_ASSERT(indices[i] < N); + if (i < count) { + indices[i] |= N; + HWY_ASSERT(indices[i] < 0x10); + } + packed += indices[i] << (i * 4); + } + + HWY_ASSERT(packed < (1ull << (N * 4))); + printf("0x%08x,", static_cast(packed)); + } + printf("\n"); +} + +void PrintCompressNot32x8Tables() { + printf("======================================= Not 32/64x8\n"); + constexpr size_t N = 8; // AVX2 or 64-bit AVX3 + for (uint64_t not_code = 0; not_code < (1ull << N); ++not_code) { + const uint64_t code = ~not_code; + const size_t count = PopCount(code); + std::array indices{0}; + size_t pos = 0; + // All lanes where mask = true + for (size_t i = 0; i < N; ++i) { + if (code & (1ull << i)) { + indices[pos++] = i; + } + } + // All lanes where mask = false + for (size_t i = 0; i < N; ++i) { + if (!(code & (1ull << i))) { + indices[pos++] = i; + } + } + HWY_ASSERT(pos == N); + + // Convert to nibbles + uint64_t packed = 0; + for (size_t i = 0; i < N; ++i) { + HWY_ASSERT(indices[i] < N); + if (i < count) { + indices[i] |= N; + HWY_ASSERT(indices[i] < 0x10); + } + packed += indices[i] << (i * 4); + } + + HWY_ASSERT(packed < (1ull << (N * 4))); + printf("0x%08x,", static_cast(packed)); + } + printf("\n"); +} + +// Compressed to nibbles (for AVX3 64x4) +void PrintCompress64x4NibbleTables() { + printf("======================================= 64x4Nibble\n"); + constexpr size_t N = 4; // AVX2 + for (uint64_t code = 0; code < (1ull << N); ++code) { + std::array indices{0}; + size_t pos = 0; + // All lanes where mask = true + for (size_t i = 0; i < N; ++i) { + if (code & (1ull << i)) { + indices[pos++] = i; + } + } + // All lanes where mask = false + for (size_t i = 0; i < N; ++i) { + if (!(code & (1ull << i))) { + indices[pos++] = i; + } + } + HWY_ASSERT(pos == N); + + // Convert to nibbles + uint64_t packed = 0; + for (size_t i = 0; i < N; ++i) { + HWY_ASSERT(indices[i] < N); + packed += indices[i] << (i * 4); + } + + HWY_ASSERT(packed < (1ull << (N * 4))); + printf("0x%08x,", static_cast(packed)); + } + printf("\n"); +} + +void PrintCompressNot64x4NibbleTables() { + printf("======================================= Not 64x4Nibble\n"); + constexpr size_t N = 4; // AVX2 + for (uint64_t not_code = 0; not_code < (1ull << N); ++not_code) { + const uint64_t code = ~not_code; + std::array indices{0}; + size_t pos = 0; + // All lanes where mask = true + for (size_t i = 0; i < N; ++i) { + if (code & (1ull << i)) { + indices[pos++] = i; + } + } + // All lanes where mask = false + for (size_t i = 0; i < N; ++i) { + if (!(code & (1ull << i))) { + indices[pos++] = i; + } + } + HWY_ASSERT(pos == N); + + // Convert to nibbles + uint64_t packed = 0; + for (size_t i = 0; i < N; ++i) { + HWY_ASSERT(indices[i] < N); + packed += indices[i] << (i * 4); + } + + HWY_ASSERT(packed < (1ull << (N * 4))); + printf("0x%08x,", static_cast(packed)); + } + printf("\n"); +} + +void PrintCompress64x4Tables() 
{ + printf("======================================= 64x4 uncompressed\n"); + constexpr size_t N = 4; // SVE_256 + for (uint64_t code = 0; code < (1ull << N); ++code) { + std::array indices{0}; + size_t pos = 0; + // All lanes where mask = true + for (size_t i = 0; i < N; ++i) { + if (code & (1ull << i)) { + indices[pos++] = i; + } + } + // All lanes where mask = false + for (size_t i = 0; i < N; ++i) { + if (!(code & (1ull << i))) { + indices[pos++] = i; + } + } + HWY_ASSERT(pos == N); + + // Store uncompressed indices because SVE TBL returns 0 if an index is out + // of bounds. On AVX3 we simply variable-shift because permute indices are + // interpreted modulo N. Compression is not worth the extra shift+AND + // because the table is anyway only 512 bytes. + for (size_t i = 0; i < N; ++i) { + printf("%d,", static_cast(indices[i])); + } + } + printf("\n"); +} + +void PrintCompressNot64x4Tables() { + printf("======================================= Not 64x4 uncompressed\n"); + constexpr size_t N = 4; // SVE_256 + for (uint64_t not_code = 0; not_code < (1ull << N); ++not_code) { + const uint64_t code = ~not_code; + std::array indices{0}; + size_t pos = 0; + // All lanes where mask = true + for (size_t i = 0; i < N; ++i) { + if (code & (1ull << i)) { + indices[pos++] = i; + } + } + // All lanes where mask = false + for (size_t i = 0; i < N; ++i) { + if (!(code & (1ull << i))) { + indices[pos++] = i; + } + } + HWY_ASSERT(pos == N); + + // Store uncompressed indices because SVE TBL returns 0 if an index is out + // of bounds. On AVX3 we simply variable-shift because permute indices are + // interpreted modulo N. Compression is not worth the extra shift+AND + // because the table is anyway only 512 bytes. + for (size_t i = 0; i < N; ++i) { + printf("%d,", static_cast(indices[i])); + } + } + printf("\n"); +} + +// Same as above, but prints pairs of u32 indices (for AVX2). Also includes +// FirstN bits in the nibble MSB. +void PrintCompress64x4PairTables() { + printf("======================================= 64x4 u32 index\n"); + constexpr size_t N = 4; // AVX2 + for (uint64_t code = 0; code < (1ull << N); ++code) { + const size_t count = PopCount(code); + std::array indices{0}; + size_t pos = 0; + // All lanes where mask = true + for (size_t i = 0; i < N; ++i) { + if (code & (1ull << i)) { + indices[pos++] = i; + } + } + // All lanes where mask = false + for (size_t i = 0; i < N; ++i) { + if (!(code & (1ull << i))) { + indices[pos++] = i; + } + } + HWY_ASSERT(pos == N); + + // Store uncompressed indices because SVE TBL returns 0 if an index is out + // of bounds. On AVX3 we simply variable-shift because permute indices are + // interpreted modulo N. Compression is not worth the extra shift+AND + // because the table is anyway only 512 bytes. + for (size_t i = 0; i < N; ++i) { + const int first_n_bit = i < count ? 
8 : 0; + const int low = static_cast(2 * indices[i]) + first_n_bit; + HWY_ASSERT(low < 0x10); + printf("%d, %d, ", low, low + 1); + } + } + printf("\n"); +} + +void PrintCompressNot64x4PairTables() { + printf("======================================= Not 64x4 u32 index\n"); + constexpr size_t N = 4; // AVX2 + for (uint64_t not_code = 0; not_code < (1ull << N); ++not_code) { + const uint64_t code = ~not_code; + const size_t count = PopCount(code); + std::array indices{0}; + size_t pos = 0; + // All lanes where mask = true + for (size_t i = 0; i < N; ++i) { + if (code & (1ull << i)) { + indices[pos++] = i; + } + } + // All lanes where mask = false + for (size_t i = 0; i < N; ++i) { + if (!(code & (1ull << i))) { + indices[pos++] = i; + } + } + HWY_ASSERT(pos == N); + + // Store uncompressed indices because SVE TBL returns 0 if an index is out + // of bounds. On AVX3 we simply variable-shift because permute indices are + // interpreted modulo N. Compression is not worth the extra shift+AND + // because the table is anyway only 512 bytes. + for (size_t i = 0; i < N; ++i) { + const int first_n_bit = i < count ? 8 : 0; + const int low = static_cast(2 * indices[i]) + first_n_bit; + HWY_ASSERT(low < 0x10); + printf("%d, %d, ", low, low + 1); + } + } + printf("\n"); +} + +// 4-tuple of byte indices +void PrintCompress32x4Tables() { + printf("======================================= 32x4\n"); + using T = uint32_t; + constexpr size_t N = 4; // SSE4 + for (uint64_t code = 0; code < (1ull << N); ++code) { + std::array indices{0}; + size_t pos = 0; + // All lanes where mask = true + for (size_t i = 0; i < N; ++i) { + if (code & (1ull << i)) { + indices[pos++] = i; + } + } + // All lanes where mask = false + for (size_t i = 0; i < N; ++i) { + if (!(code & (1ull << i))) { + indices[pos++] = i; + } + } + HWY_ASSERT(pos == N); + + for (size_t i = 0; i < N; ++i) { + for (size_t idx_byte = 0; idx_byte < sizeof(T); ++idx_byte) { + printf("%d,", static_cast(sizeof(T) * indices[i] + idx_byte)); + } + } + } + printf("\n"); +} + +void PrintCompressNot32x4Tables() { + printf("======================================= Not 32x4\n"); + using T = uint32_t; + constexpr size_t N = 4; // SSE4 + for (uint64_t not_code = 0; not_code < (1ull << N); ++not_code) { + const uint64_t code = ~not_code; + std::array indices{0}; + size_t pos = 0; + // All lanes where mask = true + for (size_t i = 0; i < N; ++i) { + if (code & (1ull << i)) { + indices[pos++] = i; + } + } + // All lanes where mask = false + for (size_t i = 0; i < N; ++i) { + if (!(code & (1ull << i))) { + indices[pos++] = i; + } + } + HWY_ASSERT(pos == N); + + for (size_t i = 0; i < N; ++i) { + for (size_t idx_byte = 0; idx_byte < sizeof(T); ++idx_byte) { + printf("%d,", static_cast(sizeof(T) * indices[i] + idx_byte)); + } + } + } + printf("\n"); +} + +// 8-tuple of byte indices +void PrintCompress64x2Tables() { + printf("======================================= 64x2\n"); + using T = uint64_t; + constexpr size_t N = 2; // SSE4 + for (uint64_t code = 0; code < (1ull << N); ++code) { + std::array indices{0}; + size_t pos = 0; + // All lanes where mask = true + for (size_t i = 0; i < N; ++i) { + if (code & (1ull << i)) { + indices[pos++] = i; + } + } + // All lanes where mask = false + for (size_t i = 0; i < N; ++i) { + if (!(code & (1ull << i))) { + indices[pos++] = i; + } + } + HWY_ASSERT(pos == N); + + for (size_t i = 0; i < N; ++i) { + for (size_t idx_byte = 0; idx_byte < sizeof(T); ++idx_byte) { + printf("%d,", static_cast(sizeof(T) * indices[i] + idx_byte)); + } + } + } 
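+  // (The loops above expand each lane index into sizeof(T) consecutive
+  // byte indices, presumably because these tables drive byte-granularity
+  // shuffles that have no notion of 64-bit lanes.)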
+ printf("\n"); +} + +void PrintCompressNot64x2Tables() { + printf("======================================= Not 64x2\n"); + using T = uint64_t; + constexpr size_t N = 2; // SSE4 + for (uint64_t not_code = 0; not_code < (1ull << N); ++not_code) { + const uint64_t code = ~not_code; + std::array indices{0}; + size_t pos = 0; + // All lanes where mask = true + for (size_t i = 0; i < N; ++i) { + if (code & (1ull << i)) { + indices[pos++] = i; + } + } + // All lanes where mask = false + for (size_t i = 0; i < N; ++i) { + if (!(code & (1ull << i))) { + indices[pos++] = i; + } + } + HWY_ASSERT(pos == N); + + for (size_t i = 0; i < N; ++i) { + for (size_t idx_byte = 0; idx_byte < sizeof(T); ++idx_byte) { + printf("%d,", static_cast(sizeof(T) * indices[i] + idx_byte)); + } + } + } + printf("\n"); +} + +} // namespace detail + +HWY_NOINLINE void PrintTables() { + // Only print once. +#if HWY_TARGET == HWY_STATIC_TARGET + detail::PrintCompress32x8Tables(); + detail::PrintCompressNot32x8Tables(); + detail::PrintCompress64x4NibbleTables(); + detail::PrintCompressNot64x4NibbleTables(); + detail::PrintCompress64x4Tables(); + detail::PrintCompressNot64x4Tables(); + detail::PrintCompress32x4Tables(); + detail::PrintCompressNot32x4Tables(); + detail::PrintCompress64x2Tables(); + detail::PrintCompressNot64x2Tables(); + detail::PrintCompress64x4PairTables(); + detail::PrintCompressNot64x4PairTables(); + detail::PrintCompress16x8Tables(); + detail::PrintCompressNot16x8Tables(); +#endif +} + +#endif // HWY_PRINT_TABLES + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE + +namespace hwy { +HWY_BEFORE_TEST(HwyCompressTest); +#if HWY_PRINT_TABLES +// Only print instead of running tests; this will be visible in the log. +HWY_EXPORT_AND_TEST_P(HwyCompressTest, PrintTables); +#else +HWY_EXPORT_AND_TEST_P(HwyCompressTest, TestAllCompress); +HWY_EXPORT_AND_TEST_P(HwyCompressTest, TestAllCompressBlocks); +#endif +} // namespace hwy + +#endif diff --git a/hwy/tests/convert_test.cc b/hwy/tests/convert_test.cc new file mode 100644 index 0000000..a7aea5f --- /dev/null +++ b/hwy/tests/convert_test.cc @@ -0,0 +1,643 @@ +// Copyright 2019 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include + +#include // std::isfinite + +#include "hwy/base.h" + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "tests/convert_test.cc" +#include "hwy/foreach_target.h" // IWYU pragma: keep +#include "hwy/highway.h" +#include "hwy/tests/test_util-inl.h" + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { + +// Cast and ensure bytes are the same. Called directly from TestAllBitCast or +// via TestBitCastFrom. 
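+// (Concrete instance of the property tested below: BitCast from float to
+// uint32_t must turn Set(d, 1.0f) into lanes holding 0x3F800000, the IEEE
+// 754 encoding of 1.0f, with no numeric conversion.)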
+template +struct TestBitCast { + template + HWY_NOINLINE void operator()(T /*unused*/, D d) { + const Repartition dto; + const size_t N = Lanes(d); + const size_t Nto = Lanes(dto); + if (N == 0 || Nto == 0) return; + HWY_ASSERT_EQ(N * sizeof(T), Nto * sizeof(ToT)); + const auto vf = Iota(d, 1); + const auto vt = BitCast(dto, vf); + // Must return the same bits + auto from_lanes = AllocateAligned(Lanes(d)); + auto to_lanes = AllocateAligned(Lanes(dto)); + Store(vf, d, from_lanes.get()); + Store(vt, dto, to_lanes.get()); + HWY_ASSERT( + BytesEqual(from_lanes.get(), to_lanes.get(), Lanes(d) * sizeof(T))); + } +}; + +// From D to all types. +struct TestBitCastFrom { + template + HWY_NOINLINE void operator()(T t, D d) { + TestBitCast()(t, d); + TestBitCast()(t, d); + TestBitCast()(t, d); +#if HWY_HAVE_INTEGER64 + TestBitCast()(t, d); +#endif + TestBitCast()(t, d); + TestBitCast()(t, d); + TestBitCast()(t, d); +#if HWY_HAVE_INTEGER64 + TestBitCast()(t, d); +#endif + TestBitCast()(t, d); +#if HWY_HAVE_FLOAT64 + TestBitCast()(t, d); +#endif + } +}; + +HWY_NOINLINE void TestAllBitCast() { + // For HWY_SCALAR and partial vectors, we can only cast to same-sized types: + // the former can't partition its single lane, and the latter can be smaller + // than a destination type. + const ForPartialVectors> to_u8; + to_u8(uint8_t()); + to_u8(int8_t()); + + const ForPartialVectors> to_i8; + to_i8(uint8_t()); + to_i8(int8_t()); + + const ForPartialVectors> to_u16; + to_u16(uint16_t()); + to_u16(int16_t()); + + const ForPartialVectors> to_i16; + to_i16(uint16_t()); + to_i16(int16_t()); + + const ForPartialVectors> to_u32; + to_u32(uint32_t()); + to_u32(int32_t()); + to_u32(float()); + + const ForPartialVectors> to_i32; + to_i32(uint32_t()); + to_i32(int32_t()); + to_i32(float()); + +#if HWY_HAVE_INTEGER64 + const ForPartialVectors> to_u64; + to_u64(uint64_t()); + to_u64(int64_t()); +#if HWY_HAVE_FLOAT64 + to_u64(double()); +#endif + + const ForPartialVectors> to_i64; + to_i64(uint64_t()); + to_i64(int64_t()); +#if HWY_HAVE_FLOAT64 + to_i64(double()); +#endif +#endif // HWY_HAVE_INTEGER64 + + const ForPartialVectors> to_float; + to_float(uint32_t()); + to_float(int32_t()); + to_float(float()); + +#if HWY_HAVE_FLOAT64 + const ForPartialVectors> to_double; + to_double(double()); +#if HWY_HAVE_INTEGER64 + to_double(uint64_t()); + to_double(int64_t()); +#endif // HWY_HAVE_INTEGER64 +#endif // HWY_HAVE_FLOAT64 + +#if HWY_TARGET != HWY_SCALAR + // For non-scalar vectors, we can cast all types to all. 
+ ForAllTypes(ForGEVectors<64, TestBitCastFrom>()); +#endif +} + +template +struct TestPromoteTo { + template + HWY_NOINLINE void operator()(T /*unused*/, D from_d) { + static_assert(sizeof(T) < sizeof(ToT), "Input type must be narrower"); + const Rebind to_d; + + const size_t N = Lanes(from_d); + auto from = AllocateAligned(N); + auto expected = AllocateAligned(N); + + RandomState rng; + for (size_t rep = 0; rep < AdjustedReps(200); ++rep) { + for (size_t i = 0; i < N; ++i) { + const uint64_t bits = rng(); + CopyBytes(&bits, &from[i]); // not same size + expected[i] = from[i]; + } + + HWY_ASSERT_VEC_EQ(to_d, expected.get(), + PromoteTo(to_d, Load(from_d, from.get()))); + } + } +}; + +HWY_NOINLINE void TestAllPromoteTo() { + const ForPromoteVectors, 1> to_u16div2; + to_u16div2(uint8_t()); + + const ForPromoteVectors, 2> to_u32div4; + to_u32div4(uint8_t()); + + const ForPromoteVectors, 1> to_u32div2; + to_u32div2(uint16_t()); + + const ForPromoteVectors, 1> to_i16div2; + to_i16div2(uint8_t()); + to_i16div2(int8_t()); + + const ForPromoteVectors, 1> to_i32div2; + to_i32div2(uint16_t()); + to_i32div2(int16_t()); + + const ForPromoteVectors, 2> to_i32div4; + to_i32div4(uint8_t()); + to_i32div4(int8_t()); + + // Must test f16/bf16 separately because we can only load/store/convert them. + +#if HWY_HAVE_INTEGER64 + const ForPromoteVectors, 1> to_u64div2; + to_u64div2(uint32_t()); + + const ForPromoteVectors, 1> to_i64div2; + to_i64div2(int32_t()); +#endif + +#if HWY_HAVE_FLOAT64 + const ForPromoteVectors, 1> to_f64div2; + to_f64div2(int32_t()); + to_f64div2(float()); +#endif +} + +template +bool IsFinite(T t) { + return std::isfinite(t); +} +// Wrapper avoids calling std::isfinite for integer types (ambiguous). +template +bool IsFinite(T /*unused*/) { + return true; +} + +template +AlignedFreeUniquePtr F16TestCases(D d, size_t& padded) { + const float test_cases[] = { + // +/- 1 + 1.0f, -1.0f, + // +/- 0 + 0.0f, -0.0f, + // near 0 + 0.25f, -0.25f, + // +/- integer + 4.0f, -32.0f, + // positive near limit + 65472.0f, 65504.0f, + // negative near limit + -65472.0f, -65504.0f, + // positive +/- delta + 2.00390625f, 3.99609375f, + // negative +/- delta + -2.00390625f, -3.99609375f, + // No infinity/NaN - implementation-defined due to ARM. 
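+      // (Why these constants: 65504 = (2 - 2^-10) * 2^15 is the largest
+      // finite f16, and 2.00390625 = 2 + 2^-8 / 3.99609375 = 4 - 2^-8 are
+      // exactly representable because the f16 ulp in [2, 4) is 2^-9.)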
+ }; + constexpr size_t kNumTestCases = sizeof(test_cases) / sizeof(test_cases[0]); + const size_t N = Lanes(d); + HWY_ASSERT(N != 0); + padded = RoundUpTo(kNumTestCases, N); // allow loading whole vectors + auto in = AllocateAligned(padded); + auto expected = AllocateAligned(padded); + size_t i = 0; + for (; i < kNumTestCases; ++i) { + in[i] = test_cases[i]; + } + for (; i < padded; ++i) { + in[i] = 0.0f; + } + return in; +} + +struct TestF16 { + template + HWY_NOINLINE void operator()(TF32 /*t*/, DF32 d32) { +#if HWY_HAVE_FLOAT16 + size_t padded; + const size_t N = Lanes(d32); // same count for f16 + HWY_ASSERT(N != 0); + auto in = F16TestCases(d32, padded); + using TF16 = float16_t; + const Rebind d16; + auto temp16 = AllocateAligned(N); + + for (size_t i = 0; i < padded; i += N) { + const auto loaded = Load(d32, &in[i]); + Store(DemoteTo(d16, loaded), d16, temp16.get()); + HWY_ASSERT_VEC_EQ(d32, loaded, PromoteTo(d32, Load(d16, temp16.get()))); + } +#else + (void)d32; +#endif + } +}; + +HWY_NOINLINE void TestAllF16() { ForDemoteVectors()(float()); } + +template +AlignedFreeUniquePtr BF16TestCases(D d, size_t& padded) { + const float test_cases[] = { + // +/- 1 + 1.0f, -1.0f, + // +/- 0 + 0.0f, -0.0f, + // near 0 + 0.25f, -0.25f, + // +/- integer + 4.0f, -32.0f, + // positive near limit + 3.389531389251535E38f, 1.99384199368e+38f, + // negative near limit + -3.389531389251535E38f, -1.99384199368e+38f, + // positive +/- delta + 2.015625f, 3.984375f, + // negative +/- delta + -2.015625f, -3.984375f, + }; + constexpr size_t kNumTestCases = sizeof(test_cases) / sizeof(test_cases[0]); + const size_t N = Lanes(d); + HWY_ASSERT(N != 0); + padded = RoundUpTo(kNumTestCases, N); // allow loading whole vectors + auto in = AllocateAligned(padded); + auto expected = AllocateAligned(padded); + size_t i = 0; + for (; i < kNumTestCases; ++i) { + in[i] = test_cases[i]; + } + for (; i < padded; ++i) { + in[i] = 0.0f; + } + return in; +} + +struct TestBF16 { + template + HWY_NOINLINE void operator()(TF32 /*t*/, DF32 d32) { +#if !defined(HWY_EMULATE_SVE) + size_t padded; + auto in = BF16TestCases(d32, padded); + using TBF16 = bfloat16_t; +#if HWY_TARGET == HWY_SCALAR + const Rebind dbf16; // avoid 4/2 = 2 lanes +#else + const Repartition dbf16; +#endif + const Half dbf16_half; + const size_t N = Lanes(d32); + HWY_ASSERT(Lanes(dbf16_half) <= N); + auto temp16 = AllocateAligned(N); + + for (size_t i = 0; i < padded; i += N) { + const auto loaded = Load(d32, &in[i]); + const auto v16 = DemoteTo(dbf16_half, loaded); + Store(v16, dbf16_half, temp16.get()); + const auto v16_loaded = Load(dbf16_half, temp16.get()); + HWY_ASSERT_VEC_EQ(d32, loaded, PromoteTo(d32, v16_loaded)); + } +#else + (void)d32; +#endif + } +}; + +HWY_NOINLINE void TestAllBF16() { ForShrinkableVectors()(float()); } + +struct TestConvertU8 { + template + HWY_NOINLINE void operator()(T /*unused*/, const D du32) { + const Rebind du8; + const auto wrap = Set(du32, 0xFF); + HWY_ASSERT_VEC_EQ(du8, Iota(du8, 0), U8FromU32(And(Iota(du32, 0), wrap))); + HWY_ASSERT_VEC_EQ(du8, Iota(du8, 0x7F), + U8FromU32(And(Iota(du32, 0x7F), wrap))); + } +}; + +HWY_NOINLINE void TestAllConvertU8() { + ForDemoteVectors()(uint32_t()); +} + +template +constexpr bool IsSupportedTruncation() { + return (sizeof(To) < sizeof(From)) && + (Pow2(Rebind()) + 3 >= static_cast(CeilLog2(sizeof(To)))); +} + +struct TestTruncateTo { + template ()>* = nullptr> + HWY_NOINLINE void testTo(From, To, const D) { + // do nothing + } + + template ()>* = nullptr> + HWY_NOINLINE void 
testTo(From, To, const D d) { + constexpr uint32_t base = 0xFA578D00; + const Rebind dTo; + const auto src = Iota(d, static_cast(base)); + const auto expected = Iota(dTo, static_cast(base)); + const VFromD actual = TruncateTo(dTo, src); + HWY_ASSERT_VEC_EQ(dTo, expected, actual); + } + + template + HWY_NOINLINE void operator()(T from, const D d) { + testTo(from, uint8_t(), d); + testTo(from, uint16_t(), d); + testTo(from, uint32_t(), d); + } +}; + +HWY_NOINLINE void TestAllTruncate() { + ForUnsignedTypes(ForPartialVectors()); +} + +// Separate function to attempt to work around a compiler bug on ARM: when this +// is merged with TestIntFromFloat, outputs match a previous Iota(-(N+1)) input. +struct TestIntFromFloatHuge { + template + HWY_NOINLINE void operator()(TF /*unused*/, const DF df) { + // The ARMv7 manual says that float->int saturates, i.e. chooses the + // nearest representable value. This works correctly on armhf with GCC, but + // not with clang. For reasons unknown, MSVC also runs into an out-of-memory + // error here. +#if HWY_COMPILER_CLANG || HWY_COMPILER_MSVC + (void)df; +#else + using TI = MakeSigned; + const Rebind di; + + // Workaround for incorrect 32-bit GCC codegen for SSSE3 - Print-ing + // the expected lvalue also seems to prevent the issue. + const size_t N = Lanes(df); + auto expected = AllocateAligned(N); + + // Huge positive + Store(Set(di, LimitsMax()), di, expected.get()); + HWY_ASSERT_VEC_EQ(di, expected.get(), ConvertTo(di, Set(df, TF(1E20)))); + + // Huge negative + Store(Set(di, LimitsMin()), di, expected.get()); + HWY_ASSERT_VEC_EQ(di, expected.get(), ConvertTo(di, Set(df, TF(-1E20)))); +#endif + } +}; + +class TestIntFromFloat { + template + static HWY_NOINLINE void TestPowers(TF /*unused*/, const DF df) { + using TI = MakeSigned; + const Rebind di; + constexpr size_t kBits = sizeof(TF) * 8; + + // Powers of two, plus offsets to set some mantissa bits. + const int64_t ofs_table[3] = {0LL, 3LL << (kBits / 2), 1LL << (kBits - 15)}; + for (int sign = 0; sign < 2; ++sign) { + for (size_t shift = 0; shift < kBits - 1; ++shift) { + for (int64_t ofs : ofs_table) { + const int64_t mag = (int64_t{1} << shift) + ofs; + const int64_t val = sign ? mag : -mag; + HWY_ASSERT_VEC_EQ(di, Set(di, static_cast(val)), + ConvertTo(di, Set(df, static_cast(val)))); + } + } + } + } + + template + static HWY_NOINLINE void TestRandom(TF /*unused*/, const DF df) { + using TI = MakeSigned; + const Rebind di; + const size_t N = Lanes(df); + + // TF does not have enough precision to represent TI. + const double min = static_cast(LimitsMin()); + const double max = static_cast(LimitsMax()); + + // Also check random values. 
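+    // (The loop below draws raw bit patterns, keeps only those decoding to
+    // finite floats, and expects ConvertTo to saturate out-of-range values
+    // to LimitsMin/LimitsMax, mirroring the clamping when computing
+    // expected[].)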
+ auto from = AllocateAligned(N); + auto expected = AllocateAligned(N); + RandomState rng; + for (size_t rep = 0; rep < AdjustedReps(1000); ++rep) { + for (size_t i = 0; i < N; ++i) { + do { + const uint64_t bits = rng(); + CopyBytes(&bits, &from[i]); // not same size + } while (!std::isfinite(from[i])); + if (from[i] >= max) { + expected[i] = LimitsMax(); + } else if (from[i] <= min) { + expected[i] = LimitsMin(); + } else { + expected[i] = static_cast(from[i]); + } + } + + HWY_ASSERT_VEC_EQ(di, expected.get(), + ConvertTo(di, Load(df, from.get()))); + } + } + + public: + template + HWY_NOINLINE void operator()(TF tf, const DF df) { + using TI = MakeSigned; + const Rebind di; + const size_t N = Lanes(df); + + // Integer positive + HWY_ASSERT_VEC_EQ(di, Iota(di, TI(4)), ConvertTo(di, Iota(df, TF(4.0)))); + + // Integer negative + HWY_ASSERT_VEC_EQ(di, Iota(di, -TI(N)), ConvertTo(di, Iota(df, -TF(N)))); + + // Above positive + HWY_ASSERT_VEC_EQ(di, Iota(di, TI(2)), ConvertTo(di, Iota(df, TF(2.001)))); + + // Below positive + HWY_ASSERT_VEC_EQ(di, Iota(di, TI(3)), ConvertTo(di, Iota(df, TF(3.9999)))); + + const TF eps = static_cast(0.0001); + // Above negative + HWY_ASSERT_VEC_EQ(di, Iota(di, -TI(N)), + ConvertTo(di, Iota(df, -TF(N + 1) + eps))); + + // Below negative + HWY_ASSERT_VEC_EQ(di, Iota(di, -TI(N + 1)), + ConvertTo(di, Iota(df, -TF(N + 1) - eps))); + + TestPowers(tf, df); + TestRandom(tf, df); + } +}; + +HWY_NOINLINE void TestAllIntFromFloat() { + ForFloatTypes(ForPartialVectors()); + ForFloatTypes(ForPartialVectors()); +} + +struct TestFloatFromInt { + template + HWY_NOINLINE void operator()(TF /*unused*/, const DF df) { + using TI = MakeSigned; + const RebindToSigned di; + const size_t N = Lanes(df); + + // Integer positive + HWY_ASSERT_VEC_EQ(df, Iota(df, TF(4.0)), ConvertTo(df, Iota(di, TI(4)))); + + // Integer negative + HWY_ASSERT_VEC_EQ(df, Iota(df, -TF(N)), ConvertTo(df, Iota(di, -TI(N)))); + + // Max positive + HWY_ASSERT_VEC_EQ(df, Set(df, TF(LimitsMax())), + ConvertTo(df, Set(di, LimitsMax()))); + + // Min negative + HWY_ASSERT_VEC_EQ(df, Set(df, TF(LimitsMin())), + ConvertTo(df, Set(di, LimitsMin()))); + } +}; + +HWY_NOINLINE void TestAllFloatFromInt() { + ForFloatTypes(ForPartialVectors()); +} + +struct TestFloatFromUint { + template + HWY_NOINLINE void operator()(TF /*unused*/, const DF df) { + using TU = MakeUnsigned; + const RebindToUnsigned du; + + // Integer positive + HWY_ASSERT_VEC_EQ(df, Iota(df, TF(4.0)), ConvertTo(df, Iota(du, TU(4)))); + HWY_ASSERT_VEC_EQ(df, Iota(df, TF(65535.0)), + ConvertTo(df, Iota(du, 65535))); // 2^16-1 + if (sizeof(TF) > 4) { + HWY_ASSERT_VEC_EQ(df, Iota(df, TF(4294967295.0)), + ConvertTo(df, Iota(du, 4294967295ULL))); // 2^32-1 + } + + // Max positive + HWY_ASSERT_VEC_EQ(df, Set(df, TF(LimitsMax())), + ConvertTo(df, Set(du, LimitsMax()))); + + // Zero + HWY_ASSERT_VEC_EQ(df, Zero(df), ConvertTo(df, Zero(du))); + } +}; + +HWY_NOINLINE void TestAllFloatFromUint() { + ForFloatTypes(ForPartialVectors()); +} + +struct TestI32F64 { + template + HWY_NOINLINE void operator()(TF /*unused*/, const DF df) { + using TI = int32_t; + const Rebind di; + const size_t N = Lanes(df); + + // Integer positive + HWY_ASSERT_VEC_EQ(df, Iota(df, TF(4.0)), PromoteTo(df, Iota(di, TI(4)))); + + // Integer negative + HWY_ASSERT_VEC_EQ(df, Iota(df, -TF(N)), PromoteTo(df, Iota(di, -TI(N)))); + + // Above positive + HWY_ASSERT_VEC_EQ(df, Iota(df, TF(2.0)), PromoteTo(df, Iota(di, TI(2)))); + + // Below positive + HWY_ASSERT_VEC_EQ(df, Iota(df, TF(4.0)), 
PromoteTo(df, Iota(di, TI(4)))); + + // Above negative + HWY_ASSERT_VEC_EQ(df, Iota(df, TF(-4.0)), PromoteTo(df, Iota(di, TI(-4)))); + + // Below negative + HWY_ASSERT_VEC_EQ(df, Iota(df, TF(-2.0)), PromoteTo(df, Iota(di, TI(-2)))); + + // Max positive int + HWY_ASSERT_VEC_EQ(df, Set(df, TF(LimitsMax())), + PromoteTo(df, Set(di, LimitsMax()))); + + // Min negative int + HWY_ASSERT_VEC_EQ(df, Set(df, TF(LimitsMin())), + PromoteTo(df, Set(di, LimitsMin()))); + } +}; + +HWY_NOINLINE void TestAllI32F64() { +#if HWY_HAVE_FLOAT64 + ForDemoteVectors()(double()); +#endif +} + + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE + +namespace hwy { +HWY_BEFORE_TEST(HwyConvertTest); +HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllBitCast); +HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllPromoteTo); +HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllF16); +HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllBF16); +HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllConvertU8); +HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllTruncate); +HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllIntFromFloat); +HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllFloatFromInt); +HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllFloatFromUint); +HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllI32F64); +} // namespace hwy + +#endif diff --git a/hwy/tests/crypto_test.cc b/hwy/tests/crypto_test.cc new file mode 100644 index 0000000..b7dfb19 --- /dev/null +++ b/hwy/tests/crypto_test.cc @@ -0,0 +1,553 @@ +// Copyright 2021 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include // memcpy + +#include "hwy/aligned_allocator.h" + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "tests/crypto_test.cc" +#include "hwy/foreach_target.h" // IWYU pragma: keep +#include "hwy/highway.h" +#include "hwy/tests/test_util-inl.h" + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { + +#define HWY_PRINT_CLMUL_GOLDEN 0 + +#if HWY_TARGET != HWY_SCALAR + +class TestAES { + template + HWY_NOINLINE void TestSBox(T /*unused*/, D d) { + // The generic implementation of the S-box is difficult to verify by + // inspection, so we add a white-box test that verifies it using enumeration + // (outputs for 0..255 vs. https://en.wikipedia.org/wiki/Rijndael_S-box). 
+ const uint8_t sbox[256] = { + 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01, 0x67, 0x2b, + 0xfe, 0xd7, 0xab, 0x76, 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, + 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0, 0xb7, 0xfd, 0x93, 0x26, + 0x36, 0x3f, 0xf7, 0xcc, 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15, + 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, 0x07, 0x12, 0x80, 0xe2, + 0xeb, 0x27, 0xb2, 0x75, 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0, + 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84, 0x53, 0xd1, 0x00, 0xed, + 0x20, 0xfc, 0xb1, 0x5b, 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf, + 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, 0x45, 0xf9, 0x02, 0x7f, + 0x50, 0x3c, 0x9f, 0xa8, 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5, + 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2, 0xcd, 0x0c, 0x13, 0xec, + 0x5f, 0x97, 0x44, 0x17, 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73, + 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88, 0x46, 0xee, 0xb8, 0x14, + 0xde, 0x5e, 0x0b, 0xdb, 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c, + 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79, 0xe7, 0xc8, 0x37, 0x6d, + 0x8d, 0xd5, 0x4e, 0xa9, 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08, + 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6, 0xe8, 0xdd, 0x74, 0x1f, + 0x4b, 0xbd, 0x8b, 0x8a, 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e, + 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e, 0xe1, 0xf8, 0x98, 0x11, + 0x69, 0xd9, 0x8e, 0x94, 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf, + 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, 0x41, 0x99, 0x2d, 0x0f, + 0xb0, 0x54, 0xbb, 0x16}; + + // Ensure it's safe to load an entire vector by padding. + const size_t N = Lanes(d); + const size_t padded = RoundUpTo(256, N); + auto expected = AllocateAligned(padded); + // Must wrap around to match the input (Iota). 
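+    // (On scalable targets a vector can span more than 256 bytes; Iota's
+    // u8 lanes wrap modulo 256, so the expected table is tiled with copies
+    // of the S-box below to match.)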
+ for (size_t pos = 0; pos < padded;) { + const size_t remaining = HWY_MIN(padded - pos, size_t(256)); + memcpy(expected.get() + pos, sbox, remaining); + pos += remaining; + } + + for (size_t i = 0; i < 256; i += N) { + const auto in = Iota(d, static_cast(i)); + HWY_ASSERT_VEC_EQ(d, expected.get() + i, detail::SubBytes(in)); + } + } + + public: + template + HWY_NOINLINE void operator()(T t, D d) { + // Test vector (after first KeyAddition) from + // https://csrc.nist.gov/CSRC/media/Projects/Cryptographic-Standards-and-Guidelines/documents/examples/AES_Core128.pdf + alignas(16) constexpr uint8_t test_lanes[16] = { + 0x40, 0xBF, 0xAB, 0xF4, 0x06, 0xEE, 0x4D, 0x30, + 0x42, 0xCA, 0x6B, 0x99, 0x7A, 0x5C, 0x58, 0x16}; + const auto test = LoadDup128(d, test_lanes); + + // = ShiftRow result + alignas(16) constexpr uint8_t expected_sr_lanes[16] = { + 0x09, 0x28, 0x7F, 0x47, 0x6F, 0x74, 0x6A, 0xBF, + 0x2C, 0x4A, 0x62, 0x04, 0xDA, 0x08, 0xE3, 0xEE}; + const auto expected_sr = LoadDup128(d, expected_sr_lanes); + + // = MixColumn result + alignas(16) constexpr uint8_t expected_mc_lanes[16] = { + 0x52, 0x9F, 0x16, 0xC2, 0x97, 0x86, 0x15, 0xCA, + 0xE0, 0x1A, 0xAE, 0x54, 0xBA, 0x1A, 0x26, 0x59}; + const auto expected_mc = LoadDup128(d, expected_mc_lanes); + + // = KeyAddition result + alignas(16) constexpr uint8_t expected_lanes[16] = { + 0xF2, 0x65, 0xE8, 0xD5, 0x1F, 0xD2, 0x39, 0x7B, + 0xC3, 0xB9, 0x97, 0x6D, 0x90, 0x76, 0x50, 0x5C}; + const auto expected = LoadDup128(d, expected_lanes); + + alignas(16) uint8_t key_lanes[16]; + for (size_t i = 0; i < 16; ++i) { + key_lanes[i] = expected_mc_lanes[i] ^ expected_lanes[i]; + } + const auto round_key = LoadDup128(d, key_lanes); + + HWY_ASSERT_VEC_EQ(d, expected_mc, AESRound(test, Zero(d))); + HWY_ASSERT_VEC_EQ(d, expected, AESRound(test, round_key)); + HWY_ASSERT_VEC_EQ(d, expected_sr, AESLastRound(test, Zero(d))); + HWY_ASSERT_VEC_EQ(d, Xor(expected_sr, round_key), + AESLastRound(test, round_key)); + + TestSBox(t, d); + } +}; +HWY_NOINLINE void TestAllAES() { ForGEVectors<128, TestAES>()(uint8_t()); } + +#else +HWY_NOINLINE void TestAllAES() {} +#endif // HWY_TARGET != HWY_SCALAR + +struct TestCLMul { + template + HWY_NOINLINE void operator()(T /*unused*/, D d) { + // needs 64 bit lanes and 128-bit result +#if HWY_TARGET != HWY_SCALAR && HWY_HAVE_INTEGER64 + const size_t N = Lanes(d); + if (N == 1) return; + + auto in1 = AllocateAligned(N); + auto in2 = AllocateAligned(N); + + constexpr size_t kCLMulNum = 512; + // Depends on rng! 
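+    // (Golden values: the table below was captured from a run with this
+    // file's fixed-seed RandomState; set HWY_PRINT_CLMUL_GOLDEN to
+    // regenerate it if the generator changes.)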
+ static constexpr uint64_t kCLMulLower[kCLMulNum] = { + 0x24511d4ce34d6350ULL, 0x4ca582edde1236bbULL, 0x537e58f72dac25a8ULL, + 0x4e942d5e130b9225ULL, 0x75a906c519257a68ULL, 0x1df9f85126d96c5eULL, + 0x464e7c13f4ad286aULL, 0x138535ee35dabc40ULL, 0xb2f7477b892664ecULL, + 0x01557b077167c25dULL, 0xf32682490ee49624ULL, 0x0025bac603b9e140ULL, + 0xcaa86aca3e3daf40ULL, 0x1fbcfe4af73eb6c4ULL, 0x8ee8064dd0aae5dcULL, + 0x1248cb547858c213ULL, 0x37a55ee5b10fb34cULL, 0x6eb5c97b958f86e2ULL, + 0x4b1ab3eb655ea7cdULL, 0x1d66645a85627520ULL, 0xf8728e96daa36748ULL, + 0x38621043e6ff5e3bULL, 0xd1d28b5da5ffefb4ULL, 0x0a5cd65931546df7ULL, + 0x2a0639be3d844150ULL, 0x0e2d0f18c8d6f045ULL, 0xfacc770b963326c1ULL, + 0x19611b31ca2ef141ULL, 0xabea29510dd87518ULL, 0x18a7dc4b205f2768ULL, + 0x9d3975ea5612dc86ULL, 0x06319c139e374773ULL, 0x6641710400b4c390ULL, + 0x356c29b6001c3670ULL, 0xe9e04d851e040a00ULL, 0x21febe561222d79aULL, + 0xc071eaae6e148090ULL, 0x0eed351a0af94f5bULL, 0x04324eedb3c03688ULL, + 0x39e89b136e0d6ccdULL, 0x07d0fd2777a31600ULL, 0x44b8573827209822ULL, + 0x6d690229ea177d78ULL, 0x1b9749d960ba9f18ULL, 0x190945271c0fbb94ULL, + 0x189aea0e07d2c88eULL, 0xf18eab6b65a6beb2ULL, 0x57744b21c13d0d84ULL, + 0xf63050a613e95c2eULL, 0x12cd20d25f97102fULL, 0x5a5df0678dbcba60ULL, + 0x0b08fb80948bfafcULL, 0x44cf1cbe7c6fc3c8ULL, 0x166a470ef25da288ULL, + 0x2c498a609204e48cULL, 0x261b0a22585697ecULL, 0x737750574af7dde4ULL, + 0x4079959c60b01e0cULL, 0x06ed8aac13f782d6ULL, 0x019d454ba9b5ef20ULL, + 0xea1edbf96d49e858ULL, 0x17c2f3ebde9ac469ULL, 0x5cf72706e3d6f5e4ULL, + 0x16e856aa3c841516ULL, 0x256f7e3cef83368eULL, 0x47e17c8eb2774e77ULL, + 0x9b48ac150a804821ULL, 0x584523f61ccfdf22ULL, 0xedcb6a2a75d9e7f2ULL, + 0x1fe3d1838e537aa7ULL, 0x778872e9f64549caULL, 0x2f1cea6f0d3faf92ULL, + 0x0e8c4b6a9343f326ULL, 0x01902d1ba3048954ULL, 0xc5c1fd5269e91dc0ULL, + 0x0ef8a4707817eb9cULL, 0x1f696f09a5354ca4ULL, 0x369cd9de808b818cULL, + 0xf6917d1dd43fd784ULL, 0x7f4b76bf40dc166fULL, 0x4ce67698724ace12ULL, + 0x02c3bf60e6e9cd92ULL, 0xb8229e45b21458e8ULL, 0x415efd41e91adf49ULL, + 0x5edfcd516bb921cdULL, 0x5ff2c29429fd187eULL, 0x0af666b17103b3e0ULL, + 0x1f5e4ff8f54c9a5bULL, 0x429253d8a5544ba6ULL, 0x19de2fdf9f4d9dcaULL, + 0x29bf3d37ddc19a40ULL, 0x04d4513a879552baULL, 0x5cc7476cf71ee155ULL, + 0x40011f8c238784a5ULL, 0x1a3ae50b0fd2ee2bULL, 0x7db22f432ba462baULL, + 0x417290b0bee2284aULL, 0x055a6bd5bb853db2ULL, 0xaa667daeed8c2a34ULL, + 0x0d6b316bda7f3577ULL, 0x72d35598468e3d5dULL, 0x375b594804bfd33aULL, + 0x16ed3a319b540ae8ULL, 0x093bace4b4695afdULL, 0xc7118754ec2737ceULL, + 0x0fff361f0505c81aULL, 0x996e9e7291321af0ULL, 0x496b1d9b0b89ba8cULL, + 0x65a98b2e9181da9cULL, 0x70759c8dd45575dfULL, 0x3446fe727f5e2cbbULL, + 0x1121ae609d195e74ULL, 0x5ff5d68ce8a21018ULL, 0x0e27eca3825b60d6ULL, + 0x82f628bceca3d1daULL, 0x2756a0914e344047ULL, 0xa460406c1c708d50ULL, + 0x63ce32a0c083e491ULL, 0xc883e5a685c480e0ULL, 0x602c951891e600f9ULL, + 0x02ecb2e3911ca5f8ULL, 0x0d8675f4bb70781aULL, 0x43545cc3c78ea496ULL, + 0x04164b01d6b011c2ULL, 0x3acbb323dcab2c9bULL, 0x31c5ba4e22793082ULL, + 0x5a6484af5f7c2d10ULL, 0x1a929b16194e8078ULL, 0x7a6a75d03b313924ULL, + 0x0553c73a35b1d525ULL, 0xf18628c51142be34ULL, 0x1b51cf80d7efd8f5ULL, + 0x52e0ca4df63ee258ULL, 0x0e977099160650c9ULL, 0x6be1524e92024f70ULL, + 0x0ee2152625438b9dULL, 0xfa32af436f6d8eb4ULL, 0x5ecf49c2154287e5ULL, + 0x6b72f4ae3590569dULL, 0x086c5ee6e87bfb68ULL, 0x737a4f0dc04b6187ULL, + 0x08c3439280edea41ULL, 0x9547944f01636c5cULL, 0x6acfbfc2571cd71fULL, + 0x85d7842972449637ULL, 0x252ea5e5a7fad86aULL, 0x4e41468f99ba1632ULL, + 
0x095e0c3ae63b25a2ULL, 0xb005ce88fd1c9425ULL, 0x748e668abbe09f03ULL, + 0xb2cfdf466b187d18ULL, 0x60b11e633d8fe845ULL, 0x07144c4d246db604ULL, + 0x139bcaac55e96125ULL, 0x118679b5a6176327ULL, 0x1cebe90fa4d9f83fULL, + 0x22244f52f0d312acULL, 0x669d4e17c9bfb713ULL, 0x96390e0b834bb0d0ULL, + 0x01f7f0e82ba08071ULL, 0x2dffeee31ca6d284ULL, 0x1f4738745ef039feULL, + 0x4ce0dd2b603b6420ULL, 0x0035fc905910a4d5ULL, 0x07df2b533df6fb04ULL, + 0x1cee2735c9b910ddULL, 0x2bc4af565f7809eaULL, 0x2f876c1f5cb1076cULL, + 0x33e079524099d056ULL, 0x169e0405d2f9efbaULL, 0x018643ab548a358cULL, + 0x1bb6fc4331cffe92ULL, 0x05111d3a04e92faaULL, 0x23c27ecf0d638b73ULL, + 0x1b79071dc1685d68ULL, 0x0662d20aba8e1e0cULL, 0xe7f6440277144c6fULL, + 0x4ca38b64c22196c0ULL, 0x43c05f6d1936fbeeULL, 0x0654199d4d1faf0fULL, + 0xf2014054e71c2d04ULL, 0x0a103e47e96b4c84ULL, 0x7986e691dd35b040ULL, + 0x4e1ebb53c306a341ULL, 0x2775bb3d75d65ba6ULL, 0x0562ab0adeff0f15ULL, + 0x3c2746ad5eba3eacULL, 0x1facdb5765680c60ULL, 0xb802a60027d81d00ULL, + 0x1191d0f6366ae3a9ULL, 0x81a97b5ae0ea5d14ULL, 0x06bee05b6178a770ULL, + 0xc7baeb2fe1d6aeb3ULL, 0x594cb5b867d04fdfULL, 0xf515a80138a4e350ULL, + 0x646417ad8073cf38ULL, 0x4a229a43373fb8d4ULL, 0x10fa6eafff1ca453ULL, + 0x9f060700895cc731ULL, 0x00521133d11d11f4ULL, 0xb940a2bb912a7a5cULL, + 0x3fab180670ad2a3cULL, 0x45a5f0e5b6fdb95dULL, 0x27c1baad6f946b15ULL, + 0x336c6bdbe527cf58ULL, 0x3b83aa602a5baea3ULL, 0xdf749153f9bcc376ULL, + 0x1a05513a6c0b4a90ULL, 0xb81e0b570a075c47ULL, 0x471fabb40bdc27ceULL, + 0x9dec9472f6853f60ULL, 0x361f71b88114193bULL, 0x3b550a8c4feeff00ULL, + 0x0f6cde5a68bc9bc0ULL, 0x3f50121a925703e0ULL, 0x6967ff66d6d343a9ULL, + 0xff6b5bd2ce7bc3ccULL, 0x05474cea08bf6cd8ULL, 0xf76eabbfaf108eb0ULL, + 0x067529be4fc6d981ULL, 0x4d766b137cf8a988ULL, 0x2f09c7395c5cfbbdULL, + 0x388793712da06228ULL, 0x02c9ff342c8f339aULL, 0x152c734139a860a3ULL, + 0x35776eb2b270c04dULL, 0x0f8d8b41f11c4608ULL, 0x0c2071665be6b288ULL, + 0xc034e212b3f71d88ULL, 0x071d961ef3276f99ULL, 0xf98598ee75b60773ULL, + 0x062062c58c6724e4ULL, 0xd156438e2125572cULL, 0x38552d59a7f0f7c8ULL, + 0x1a402178206e413cULL, 0x1f1f996c68293b26ULL, 0x8bce3cafe1730f7eULL, + 0x2d0480a0828f6bf5ULL, 0x6c99cffa171f92f6ULL, 0x0087f842bb0ac681ULL, + 0x11d7ed06e1e7fd3eULL, 0x07cb1186f2385dc6ULL, 0x5d7763ebff1e170fULL, + 0x2dacc870231ac292ULL, 0x8486317a9ffb390cULL, 0x1c3a6dd20c959ac6ULL, + 0x90dc96e3992e06b8ULL, 0x70d60bfa33e72b67ULL, 0x70c9bddd0985ee63ULL, + 0x012c9767b3673093ULL, 0xfcd3bc5580f6a88aULL, 0x0ac80017ef6308c3ULL, + 0xdb67d709ef4bba09ULL, 0x4c63e324f0e247ccULL, 0xa15481d3fe219d60ULL, + 0x094c4279cdccb501ULL, 0x965a28c72575cb82ULL, 0x022869db25e391ebULL, + 0x37f528c146023910ULL, 0x0c1290636917deceULL, 0x9aee25e96251ca9cULL, + 0x728ac5ba853b69c2ULL, 0x9f272c93c4be20c8ULL, 0x06c1aa6319d28124ULL, + 0x4324496b1ca8a4f7ULL, 0x0096ecfe7dfc0189ULL, 0x9e06131b19ae0020ULL, + 0x15278b15902f4597ULL, 0x2a9fece8c13842d8ULL, 0x1d4e6781f0e1355eULL, + 0x6855b712d3dbf7c0ULL, 0x06a07fad99be6f46ULL, 0x3ed9d7957e4d1d7cULL, + 0x0c326f7cbc248bb2ULL, 0xe6363ad2c537cf51ULL, 0x0e12eb1c40723f13ULL, + 0xf5c6ac850afba803ULL, 0x0322a79d615fa9f0ULL, 0x6116696ed97bd5f8ULL, + 0x0d438080fbbdc9f1ULL, 0x2e4dc42c38f1e243ULL, 0x64948e9104f3a5bfULL, + 0x9fd622371bdb5f00ULL, 0x0f12bf082b2a1b6eULL, 0x4b1f8d867d78031cULL, + 0x134392ea9f5ef832ULL, 0xf3d70472321bc23eULL, 0x05fcbe5e9eea268eULL, + 0x136dede7175a22cfULL, 0x1308f8baac2cbcccULL, 0xd691026f0915eb64ULL, + 0x0e49a668345c3a38ULL, 0x24ddbbe8bc96f331ULL, 0x4d2ec9479b640578ULL, + 0x450f0697327b359cULL, 0x32b45360f4488ee0ULL, 0x4f6d9ecec46a105aULL, + 
0x5500c63401ae8e80ULL, 0x47dea495cf6f98baULL, 0x13dc9a2dfca80babULL, + 0xe6f8a93f7b24ca92ULL, 0x073f57a6d900a87fULL, 0x9ddb935fd3aa695aULL, + 0x101e98d24b39e8aaULL, 0x6b8d0eb95a507ddcULL, 0x45a908b3903d209bULL, + 0x6c96a3e119e617d4ULL, 0x2442787543d3be48ULL, 0xd3bc055c7544b364ULL, + 0x7693bb042ca8653eULL, 0xb95e3a4ea5d0101eULL, 0x116f0d459bb94a73ULL, + 0x841244b72cdc5e90ULL, 0x1271acced6cb34d3ULL, 0x07d289106524d638ULL, + 0x537c9cf49c01b5bbULL, 0x8a8e16706bb7a5daULL, 0x12e50a9c499dc3a9ULL, + 0x1cade520db2ba830ULL, 0x1add52f000d7db70ULL, 0x12cf15db2ce78e30ULL, + 0x0657eaf606bfc866ULL, 0x4026816d3b05b1d0ULL, 0x1ba0ebdf90128e4aULL, + 0xdfd649375996dd6eULL, 0x0f416e906c23d9aeULL, 0x384273cad0582a24ULL, + 0x2ff27b0378a46189ULL, 0xc4ecd18a2d7a7616ULL, 0x35cef0b5cd51d640ULL, + 0x7d582363643f48b7ULL, 0x0984ad746ad0ab7cULL, 0x2990a999835f9688ULL, + 0x2d4df66a97b19e05ULL, 0x592c79720af99aa2ULL, 0x052863c230602cd3ULL, + 0x5f5e2b15edcf2840ULL, 0x01dff1b694b978b0ULL, 0x14345a48b622025eULL, + 0x028fab3b6407f715ULL, 0x3455d188e6feca50ULL, 0x1d0d40288fb1b5fdULL, + 0x4685c5c2b6a1e5aeULL, 0x3a2077b1e5fe5adeULL, 0x1bc55d611445a0d8ULL, + 0x05480ae95f3f83feULL, 0xbbb59cfcf7e17fb6ULL, 0x13f7f10970bbb990ULL, + 0x6d00ac169425a352ULL, 0x7da0db397ef2d5d3ULL, 0x5b512a247f8d2479ULL, + 0x637eaa6a977c3c32ULL, 0x3720f0ae37cba89cULL, 0x443df6e6aa7f525bULL, + 0x28664c287dcef321ULL, 0x03c267c00cf35e49ULL, 0x690185572d4021deULL, + 0x2707ff2596e321c2ULL, 0xd865f5af7722c380ULL, 0x1ea285658e33aafbULL, + 0xc257c5e88755bef4ULL, 0x066f67275cfcc31eULL, 0xb09931945cc0fed0ULL, + 0x58c1dc38d6e3a03fULL, 0xf99489678fc94ee8ULL, 0x75045bb99be5758aULL, + 0x6c163bc34b40feefULL, 0x0420063ce7bdd3b4ULL, 0xf86ef10582bf2e28ULL, + 0x162c3449ca14858cULL, 0x94106aa61dfe3280ULL, 0x4073ae7a4e7e4941ULL, + 0x32b13fd179c250b4ULL, 0x0178fbb216a7e744ULL, 0xf840ae2f1cf92669ULL, + 0x18fc709acc80243dULL, 0x20ac2ebd69f4d558ULL, 0x6e580ad9c73ad46aULL, + 0x76d2b535b541c19dULL, 0x6c7a3fb9dd0ce0afULL, 0xc3481689b9754f28ULL, + 0x156e813b6557abdbULL, 0x6ee372e31276eb10ULL, 0x19cf37c038c8d381ULL, + 0x00d4d906c9ae3072ULL, 0x09f03cbb6dfbfd40ULL, 0x461ba31c4125f3cfULL, + 0x25b29fc63ad9f05bULL, 0x6808c95c2dddede9ULL, 0x0564224337066d9bULL, + 0xc87eb5f4a4d966f2ULL, 0x66fc66e1701f5847ULL, 0xc553a3559f74da28ULL, + 0x1dfd841be574df43ULL, 0x3ee2f100c3ebc082ULL, 0x1a2c4f9517b56e89ULL, + 0x502f65c4b535c8ffULL, 0x1da5663ab6f96ec0ULL, 0xba1f80b73988152cULL, + 0x364ff12182ac8dc1ULL, 0xe3457a3c4871db31ULL, 0x6ae9cadf92fd7e84ULL, + 0x9621ba3d6ca15186ULL, 0x00ff5af878c144ceULL, 0x918464dc130101a4ULL, + 0x036511e6b187efa6ULL, 0x06667d66550ff260ULL, 0x7fd18913f9b51bc1ULL, + 0x3740e6b27af77aa8ULL, 0x1f546c2fd358ff8aULL, 0x42f1424e3115c891ULL, + 0x03767db4e3a1bb33ULL, 0xa171a1c564345060ULL, 0x0afcf632fd7b1324ULL, + 0xb59508d933ffb7d0ULL, 0x57d766c42071be83ULL, 0x659f0447546114a2ULL, + 0x4070364481c460aeULL, 0xa2b9752280644d52ULL, 0x04ab884bea5771bdULL, + 0x87cd135602a232b4ULL, 0x15e54cd9a8155313ULL, 0x1e8005efaa3e1047ULL, + 0x696b93f4ab15d39fULL, 0x0855a8e540de863aULL, 0x0bb11799e79f9426ULL, + 0xeffa61e5c1b579baULL, 0x1e060a1d11808219ULL, 0x10e219205667c599ULL, + 0x2f7b206091c49498ULL, 0xb48854c820064860ULL, 0x21c4aaa3bfbe4a38ULL, + 0x8f4a032a3fa67e9cULL, 0x3146b3823401e2acULL, 0x3afee26f19d88400ULL, + 0x167087c485791d38ULL, 0xb67a1ed945b0fb4bULL, 0x02436eb17e27f1c0ULL, + 0xe05afce2ce2d2790ULL, 0x49c536fc6224cfebULL, 0x178865b3b862b856ULL, + 0x1ce530de26acde5bULL, 0x87312c0b30a06f38ULL, 0x03e653b578558d76ULL, + 0x4d3663c21d8b3accULL, 0x038003c23626914aULL, 0xd9d5a2c052a09451ULL, + 
0x39b5acfe08a49384ULL, 0x40f349956d5800e4ULL, 0x0968b6950b1bd8feULL, + 0xd60b2ca030f3779cULL, 0x7c8bc11a23ce18edULL, 0xcc23374e27630bc2ULL, + 0x2e38fc2a8bb33210ULL, 0xe421357814ee5c44ULL, 0x315fb65ea71ec671ULL, + 0xfb1b0223f70ed290ULL, 0x30556c9f983eaf07ULL, 0x8dd438c3d0cd625aULL, + 0x05a8fd0c7ffde71bULL, 0x764d1313b5aeec7aULL, 0x2036af5de9622f47ULL, + 0x508a5bfadda292feULL, 0x3f77f04ba2830e90ULL, 0x9047cd9c66ca66d2ULL, + 0x1168b5318a54eb21ULL, 0xc93462d221da2e15ULL, 0x4c2c7cc54abc066eULL, + 0x767a56fec478240eULL, 0x095de72546595bd3ULL, 0xc9da535865158558ULL, + 0x1baccf36f33e73fbULL, 0xf3d7dbe64df77f18ULL, 0x1f8ebbb7be4850b8ULL, + 0x043c5ed77bce25a1ULL, 0x07d401041b2a178aULL, 0x9181ebb8bd8d5618ULL, + 0x078b935dc3e4034aULL, 0x7b59c08954214300ULL, 0x03570dc2a4f84421ULL, + 0xdd8715b82f6b4078ULL, 0x2bb49c8bb544163bULL, 0xc9eb125564d59686ULL, + 0x5fdc7a38f80b810aULL, 0x3a4a6d8fff686544ULL, 0x28360e2418627d3aULL, + 0x60874244c95ed992ULL, 0x2115cc1dd9c34ed3ULL, 0xfaa3ef61f55e9efcULL, + 0x27ac9b1ef1adc7e6ULL, 0x95ea00478fec3f54ULL, 0x5aea808b2d99ab43ULL, + 0xc8f79e51fe43a580ULL, 0x5dbccd714236ce25ULL, 0x783fa76ed0753458ULL, + 0x48cb290f19d84655ULL, 0xc86a832f7696099aULL, 0x52f30c6fec0e71d3ULL, + 0x77d4e91e8cdeb886ULL, 0x7169a703c6a79ccdULL, 0x98208145b9596f74ULL, + 0x0945695c761c0796ULL, 0x0be897830d17bae0ULL, 0x033ad3924caeeeb4ULL, + 0xedecb6cfa2d303a8ULL, 0x3f86b074818642e7ULL, 0xeefa7c878a8b03f4ULL, + 0x093c101b80922551ULL, 0xfb3b4e6c26ac0034ULL, 0x162bf87999b94f5eULL, + 0xeaedae76e975b17cULL, 0x1852aa090effe18eULL}; + + static constexpr uint64_t kCLMulUpper[kCLMulNum] = { + 0xbb41199b1d587c69ULL, 0x514d94d55894ee29ULL, 0xebc6cd4d2efd5d16ULL, + 0x042044ad2de477fdULL, 0xb865c8b0fcdf4b15ULL, 0x0724d7e551cc40f3ULL, + 0xb15a16f39edb0bccULL, 0x37d64419ede7a171ULL, 0x2aa01bb80c753401ULL, + 0x06ff3f8a95fdaf4dULL, 0x79898cc0838546deULL, 0x776acbd1b237c60aULL, + 0x4c1753be4f4e0064ULL, 0x0ba9243601206ed3ULL, 0xd567c3b1bf3ec557ULL, + 0x043fac7bcff61fb3ULL, 0x49356232b159fb2fULL, 0x3910c82038102d4dULL, + 0x30592fef753eb300ULL, 0x7b2660e0c92a9e9aULL, 0x8246c9248d671ef0ULL, + 0x5a0dcd95147af5faULL, 0x43fde953909cc0eaULL, 0x06147b972cb96e1bULL, + 0xd84193a6b2411d80ULL, 0x00cd7711b950196fULL, 0x1088f9f4ade7fa64ULL, + 0x05a13096ec113cfbULL, 0x958d816d53b00edcULL, 0x3846154a7cdba9cbULL, + 0x8af516db6b27d1e6ULL, 0x1a1d462ab8a33b13ULL, 0x4040b0ac1b2c754cULL, + 0x05127fe9af2fe1d6ULL, 0x9f96e79374321fa6ULL, 0x06ff64a4d9c326f3ULL, + 0x28709566e158ac15ULL, 0x301701d7111ca51cULL, 0x31e0445d1b9d9544ULL, + 0x0a95aff69bf1d03eULL, 0x7c298c8414ecb879ULL, 0x00801499b4143195ULL, + 0x91521a00dd676a5cULL, 0x2777526a14c2f723ULL, 0xfa26aac6a6357dddULL, + 0x1d265889b0187a4bULL, 0xcd6e70fa8ed283e4ULL, 0x18a815aa50ea92caULL, + 0xc01e082694a263c6ULL, 0x4b40163ba53daf25ULL, 0xbc658caff6501673ULL, + 0x3ba35359586b9652ULL, 0x74f96acc97a4936cULL, 0x3989dfdb0cf1d2cfULL, + 0x358a01eaa50dda32ULL, 0x01109a5ed8f0802bULL, 0x55b84922e63c2958ULL, + 0x55b14843d87551d5ULL, 0x1db8ec61b1b578d8ULL, 0x79a2d49ef8c3658fULL, + 0xa304516816b3fbe0ULL, 0x163ecc09cc7b82f9ULL, 0xab91e8d22aabef00ULL, + 0x0ed6b09262de8354ULL, 0xcfd47d34cf73f6f2ULL, 0x7dbd1db2390bc6c3ULL, + 0x5ae789d3875e7b00ULL, 0x1d60fd0e70fe8fa4ULL, 0x690bc15d5ae4f6f5ULL, + 0x121ef5565104fb44ULL, 0x6e98e89297353b54ULL, 0x42554949249d62edULL, + 0xd6d6d16b12df78d2ULL, 0x320b33549b74975dULL, 0xd2a0618763d22e00ULL, + 0x0808deb93cba2017ULL, 0x01bd3b2302a2cc70ULL, 0x0b7b8dd4d71c8dd6ULL, + 0x34d60a3382a0756cULL, 0x40984584c8219629ULL, 0xf1152cba10093a66ULL, + 0x068001c6b2159ccbULL, 
0x3d70f13c6cda0800ULL, 0x0e6b6746a322b956ULL, + 0x83a494319d8c770bULL, 0x0faecf64a8553e9aULL, 0xa34919222c39b1bcULL, + 0x0c63850d89e71c6fULL, 0x585f0bee92e53dc8ULL, 0x10f222b13b4fa5deULL, + 0x61573114f94252f2ULL, 0x09d59c311fba6c27ULL, 0x014effa7da49ed4eULL, + 0x4a400a1bc1c31d26ULL, 0xc9091c047b484972ULL, 0x3989f341ec2230ccULL, + 0xdcb03a98b3aee41eULL, 0x4a54a676a33a95e1ULL, 0xe499b7753951ef7cULL, + 0x2f43b1d1061d8b48ULL, 0xc3313bdc68ceb146ULL, 0x5159f6bc0e99227fULL, + 0x98128e6d9c05efcaULL, 0x15ea32b27f77815bULL, 0xe882c054e2654eecULL, + 0x003d2cdb8faee8c6ULL, 0xb416dd333a9fe1dfULL, 0x73f6746aefcfc98bULL, + 0x93dc114c10a38d70ULL, 0x05055941657845eaULL, 0x2ed7351347349334ULL, + 0x26fb1ee2c69ae690ULL, 0xa4575d10dc5b28e0ULL, 0x3395b11295e485ebULL, + 0xe840f198a224551cULL, 0x78e6e5a431d941d4ULL, 0xa1fee3ceab27f391ULL, + 0x07d35b3c5698d0dcULL, 0x983c67fca9174a29ULL, 0x2bb6bbae72b5144aULL, + 0xa7730b8d13ce58efULL, 0x51b5272883de1998ULL, 0xb334e128bb55e260ULL, + 0x1cacf5fbbe1b9974ULL, 0x71a9df4bb743de60ULL, 0x5176fe545c2d0d7aULL, + 0xbe592ecf1a16d672ULL, 0x27aa8a30c3efe460ULL, 0x4c78a32f47991e06ULL, + 0x383459294312f26aULL, 0x97ba789127f1490cULL, 0x51c9aa8a3abd1ef1ULL, + 0xcc7355188121e50fULL, 0x0ecb3a178ae334c1ULL, 0x84879a5e574b7160ULL, + 0x0765298f6389e8f3ULL, 0x5c6750435539bb22ULL, 0x11a05cf056c937b5ULL, + 0xb5dc2172dbfb7662ULL, 0x3ffc17915d9f40e8ULL, 0xbc7904daf3b431b0ULL, + 0x71f2088490930a7cULL, 0xa89505fd9efb53c4ULL, 0x02e194afd61c5671ULL, + 0x99a97f4abf35fcecULL, 0x26830aad30fae96fULL, 0x4b2abc16b25cf0b0ULL, + 0x07ec6fffa1cafbdbULL, 0xf38188fde97a280cULL, 0x121335701afff64dULL, + 0xea5ef38b4e672a64ULL, 0x477edbcae3eabf03ULL, 0xa32813cc0e0d244dULL, + 0x13346d2af4972eefULL, 0xcbc18357af1cfa9aULL, 0x561b630316e73fa6ULL, + 0xe9dfb53249249305ULL, 0x5d2b9dd1479312eeULL, 0x3458008119b56d04ULL, + 0x50e6790b49801385ULL, 0x5bb9febe2349492bULL, 0x0c2813954299098fULL, + 0xf747b0c890a071d5ULL, 0x417e8f82cc028d77ULL, 0xa134fee611d804f8ULL, + 0x24c99ee9a0408761ULL, 0x3ebb224e727137f3ULL, 0x0686022073ceb846ULL, + 0xa05e901fb82ad7daULL, 0x0ece7dc43ab470fcULL, 0x2d334ecc58f7d6a3ULL, + 0x23166fadacc54e40ULL, 0x9c3a4472f839556eULL, 0x071717ab5267a4adULL, + 0xb6600ac351ba3ea0ULL, 0x30ec748313bb63d4ULL, 0xb5374e39287b23ccULL, + 0x074d75e784238aebULL, 0x77315879243914a4ULL, 0x3bbb1971490865f1ULL, + 0xa355c21f4fbe02d3ULL, 0x0027f4bb38c8f402ULL, 0xeef8708e652bc5f0ULL, + 0x7b9aa56cf9440050ULL, 0x113ac03c16cfc924ULL, 0x395db36d3e4bef9fULL, + 0x5d826fabcaa597aeULL, 0x2a77d3c58786d7e0ULL, 0x85996859a3ba19d4ULL, + 0x01e7e3c904c2d97fULL, 0x34f90b9b98d51fd0ULL, 0x243aa97fd2e99bb7ULL, + 0x40a0cebc4f65c1e8ULL, 0x46d3922ed4a5503eULL, 0x446e7ecaf1f9c0a4ULL, + 0x49dc11558bc2e6aeULL, 0xe7a9f20881793af8ULL, 0x5771cc4bc98103f1ULL, + 0x2446ea6e718fce90ULL, 0x25d14aca7f7da198ULL, 0x4347af186f9af964ULL, + 0x10cb44fc9146363aULL, 0x8a35587afce476b4ULL, 0x575144662fee3d3aULL, + 0x69f41177a6bc7a05ULL, 0x02ff8c38d6b3c898ULL, 0x57c73589a226ca40ULL, + 0x732f6b5baae66683ULL, 0x00c008bbedd4bb34ULL, 0x7412ff09524d6cadULL, + 0xb8fd0b5ad8c145a8ULL, 0x74bd9f94b6cdc7dfULL, 0x68233b317ca6c19cULL, + 0x314b9c2c08b15c54ULL, 0x5bd1ad72072ebd08ULL, 0x6610e6a6c07030e4ULL, + 0xa4fc38e885ead7ceULL, 0x36975d1ca439e034ULL, 0xa358f0fe358ffb1aULL, + 0x38e247ad663acf7dULL, 0x77daed3643b5deb8ULL, 0x5507c2aeae1ec3d0ULL, + 0xfdec226c73acf775ULL, 0x1b87ff5f5033492dULL, 0xa832dee545d9033fULL, + 0x1cee43a61e41783bULL, 0xdff82b2e2d822f69ULL, 0x2bbc9a376cb38cf2ULL, + 0x117b1cdaf765dc02ULL, 0x26a407f5682be270ULL, 0x8eb664cf5634af28ULL, + 0x17cb4513bec68551ULL, 
0xb0df6527900cbfd0ULL, 0x335a2dc79c5afdfcULL, + 0xa2f0ca4cd38dca88ULL, 0x1c370713b81a2de1ULL, 0x849d5df654d1adfcULL, + 0x2fd1f7675ae14e44ULL, 0x4ff64dfc02247f7bULL, 0x3a2bcf40e395a48dULL, + 0x436248c821b187c1ULL, 0x29f4337b1c7104c0ULL, 0xfc317c46e6630ec4ULL, + 0x2774bccc4e3264c7ULL, 0x2d03218d9d5bee23ULL, 0x36a0ed04d659058aULL, + 0x452484461573cab6ULL, 0x0708edf87ed6272bULL, 0xf07960a1587446cbULL, + 0x3660167b067d84e0ULL, 0x65990a6993ddf8c4ULL, 0x0b197cd3d0b40b3fULL, + 0x1dcec4ab619f3a05ULL, 0x722ab223a84f9182ULL, 0x0822d61a81e7c38fULL, + 0x3d22ad75da563201ULL, 0x93cef6979fd35e0fULL, 0x05c3c25ae598b14cULL, + 0x1338df97dd496377ULL, 0x15bc324dc9c20acfULL, 0x96397c6127e6e8cfULL, + 0x004d01069ef2050fULL, 0x2fcf2e27893fdcbcULL, 0x072f77c3e44f4a5cULL, + 0x5eb1d80b3fe44918ULL, 0x1f59e7c28cc21f22ULL, 0x3390ce5df055c1f8ULL, + 0x4c0ef11df92cb6bfULL, 0x50f82f9e0848c900ULL, 0x08d0fde3ffc0ae38ULL, + 0xbd8d0089a3fbfb73ULL, 0x118ba5b0f311ef59ULL, 0x9be9a8407b926a61ULL, + 0x4ea04fbb21318f63ULL, 0xa1c8e7bb07b871ffULL, 0x1253a7262d5d3b02ULL, + 0x13e997a0512e5b29ULL, 0x54318460ce9055baULL, 0x4e1d8a4db0054798ULL, + 0x0b235226e2cade32ULL, 0x2588732c1476b315ULL, 0x16a378750ba8ac68ULL, + 0xba0b116c04448731ULL, 0x4dd02bd47694c2f1ULL, 0x16d6797b218b6b25ULL, + 0x769eb3709cfbf936ULL, 0x197746a0ce396f38ULL, 0x7d17ad8465961d6eULL, + 0xfe58f4998ae19bb4ULL, 0x36df24305233ce69ULL, 0xb88a4eb008f4ee72ULL, + 0x302b2eb923334787ULL, 0x15a4e3edbe13d448ULL, 0x39a4bf64dd7730ceULL, + 0xedf25421b31090c4ULL, 0x4d547fc131be3b69ULL, 0x2b316e120ca3b90eULL, + 0x0faf2357bf18a169ULL, 0x71f34b54ee2c1d62ULL, 0x18eaf6e5c93a3824ULL, + 0x7e168ba03c1b4c18ULL, 0x1a534dd586d9e871ULL, 0xa2cccd307f5f8c38ULL, + 0x2999a6fb4dce30f6ULL, 0x8f6d3b02c1d549a6ULL, 0x5cf7f90d817aac5aULL, + 0xd2a4ceefe66c8170ULL, 0x11560edc4ca959feULL, 0x89e517e6f0dc464dULL, + 0x75bb8972dddd2085ULL, 0x13859ed1e459d65aULL, 0x057114653326fa84ULL, + 0xe2e6f465173cc86cULL, 0x0ada4076497d7de4ULL, 0xa856fa10ec6dbf8aULL, + 0x41505d9a7c25d875ULL, 0x3091b6278382eccdULL, 0x055737185b2c3f13ULL, + 0x2f4df8ecd6f9c632ULL, 0x0633e89c33552d98ULL, 0xf7673724d16db440ULL, + 0x7331bd08e636c391ULL, 0x0252f29672fee426ULL, 0x1fc384946b6b9ddeULL, + 0x03460c12c901443aULL, 0x003a0792e10abcdaULL, 0x8dbec31f624e37d0ULL, + 0x667420d5bfe4dcbeULL, 0xfbfa30e874ed7641ULL, 0x46d1ae14db7ecef6ULL, + 0x216bd7e8f5448768ULL, 0x32bcd40d3d69cc88ULL, 0x2e991dbc39b65abeULL, + 0x0e8fb123a502f553ULL, 0x3d2d486b2c7560c0ULL, 0x09aba1db3079fe03ULL, + 0xcb540c59398c9bceULL, 0x363970e5339ed600ULL, 0x2caee457c28af00eULL, + 0x005e7d7ee47f41a0ULL, 0x69fad3eb10f44100ULL, 0x048109388c75beb3ULL, + 0x253dddf96c7a6fb8ULL, 0x4c47f705b9d47d09ULL, 0x6cec894228b5e978ULL, + 0x04044bb9f8ff45c2ULL, 0x079e75704d775caeULL, 0x073bd54d2a9e2c33ULL, + 0xcec7289270a364fbULL, 0x19e7486f19cd9e4eULL, 0xb50ac15b86b76608ULL, + 0x0620cf81f165c812ULL, 0x63eaaf13be7b11d4ULL, 0x0e0cf831948248c2ULL, + 0xf0412df8f46e7957ULL, 0x671c1fe752517e3fULL, 0x8841bfb04dd3f540ULL, + 0x122de4142249f353ULL, 0x40a4959fb0e76870ULL, 0x25cfd3d4b4bbc459ULL, + 0x78a07c82930c60d0ULL, 0x12c2de24d4cbc969ULL, 0x85d44866096ad7f4ULL, + 0x1fd917ca66b2007bULL, 0x01fbbb0751764764ULL, 0x3d2a4953c6fe0fdcULL, + 0xcc1489c5737afd94ULL, 0x1817c5b6a5346f41ULL, 0xe605a6a7e9985644ULL, + 0x3c50412328ff1946ULL, 0xd8c7fd65817f1291ULL, 0x0bd66975ab66339bULL, + 0x2baf8fa1c7d10fa9ULL, 0x24abdf06ddef848dULL, 0x14df0c9b2ea4f6c2ULL, + 0x2be950edfd2cb1f7ULL, 0x21911e21094178b6ULL, 0x0fa54d518a93b379ULL, + 0xb52508e0ac01ab42ULL, 0x0e035b5fd8cb79beULL, 0x1c1c6d1a3b3c8648ULL, + 0x286037b42ea9871cULL, 
0xfe67bf311e48a340ULL, 0x02324131e932a472ULL, + 0x2486dc2dd919e2deULL, 0x008aec7f1da1d2ebULL, 0x63269ba0e8d3eb3aULL, + 0x23c0f11154adb62fULL, 0xc6052393ecd4c018ULL, 0x523585b7d2f5b9fcULL, + 0xf7e6f8c1e87564c9ULL, 0x09eb9fe5dd32c1a3ULL, 0x4d4f86886e055472ULL, + 0x67ea17b58a37966bULL, 0x3d3ce8c23b1ed1a8ULL, 0x0df97c5ac48857ceULL, + 0x9b6992623759eb12ULL, 0x275aa9551ae091f2ULL, 0x08855e19ac5e62e5ULL, + 0x1155fffe0ae083ccULL, 0xbc9c78db7c570240ULL, 0x074560c447dd2418ULL, + 0x3bf78d330bcf1e70ULL, 0x49867cd4b7ed134bULL, 0x8e6eee0cb4470accULL, + 0x1dabafdf59233dd6ULL, 0xea3a50d844fc3fb8ULL, 0x4f03f4454764cb87ULL, + 0x1f2f41cc36c9e6ecULL, 0x53cba4df42963441ULL, 0x10883b70a88d91fbULL, + 0x62b1fc77d4eb9481ULL, 0x893d8f2604b362e1ULL, 0x0933b7855368b440ULL, + 0x9351b545703b2fceULL, 0x59c1d489b9bdd3b4ULL, 0xe72a9c4311417b18ULL, + 0x5355df77e88eb226ULL, 0xe802c37aa963d7e1ULL, 0x381c3747bd6c3bc3ULL, + 0x378565573444258cULL, 0x37848b1e52b43c18ULL, 0x5da2cd32bdce12b6ULL, + 0x13166c5da615f6fdULL, 0xa51ef95efcc66ac8ULL, 0x640c95e473f1e541ULL, + 0x6ec68def1f217500ULL, 0x49ce3543c76a4079ULL, 0x5fc6fd3cddc706b5ULL, + 0x05c3c0f0f6a1fb0dULL, 0xe7820c0996ad1bddULL, 0x21f0d752a088f35cULL, + 0x755405b51d6fc4a0ULL, 0x7ec7649ca4b0e351ULL, 0x3d2b6a46a251f790ULL, + 0x23e1176b19f418adULL, 0x06056575efe8ac05ULL, 0x0f75981b6966e477ULL, + 0x06e87ec41ad437e4ULL, 0x43f6c255d5e1cb84ULL, 0xe4e67d1120ceb580ULL, + 0x2cd67b9e12c26d7bULL, 0xcd00b5ff7fd187f1ULL, 0x3f6cd40accdc4106ULL, + 0x3e895c835459b330ULL, 0x0814d53a217c0850ULL, 0xc9111fe78bc3a62dULL, + 0x719967e351473204ULL, 0xe757707d24282aa4ULL, 0x7226b7f5607f98e6ULL, + 0x7b268ffae3c08d96ULL, 0x16d3917c8b86020eULL, 0x5128bca51c49ea64ULL, + 0x345ffea02bb1698dULL, 0x9460f5111fe4fbc8ULL, 0x60dd1aa5762852cbULL, + 0xbb7440ed3c81667cULL, 0x0a4b12affa7f6f5cULL, 0x95cbcb0ae03861b6ULL, + 0x07ab3b0591db6070ULL, 0xc6476a4c3de78982ULL, 0x204e82e8623ad725ULL, + 0x569a5b4e8ac2a5ccULL, 0x425a1d77d72ebae2ULL, 0xcdaad5551ab33830ULL, + 0x0b7c68fd8422939eULL, 0x46d9a01f53ec3020ULL, 0x102871edbb29e852ULL, + 0x7a8e8084039075a5ULL, 0x40eaede8615e376aULL, 0x4dc67d757a1c751fULL, + 0x1176ef33063f9145ULL, 0x4ea230285b1c8156ULL, 0x6b2aa46ce0027392ULL, + 0x32b13230fba1b068ULL, 0x0e69796851bb984fULL, 0xb749f4542db698c0ULL, + 0x19ad0241ffffd49cULL, 0x2f41e92ef6caff52ULL, 0x4d0b068576747439ULL, + 0x14d607aef7463e00ULL, 0x1443d00d85fb440eULL, 0x529b43bf68688780ULL, + 0x21133a6bc3a3e378ULL, 0x865b6436dae0e7e5ULL, 0x6b4fe83dc1d6defcULL, + 0x03a5858a0ca0be46ULL, 0x1e841b187e67f312ULL, 0x61ee22ef40a66940ULL, + 0x0494bd2e9e741ef8ULL, 0x4eb59e323010e72cULL, 0x19f2abcfb749810eULL, + 0xb30f1e4f994ef9bcULL, 0x53cf6cdd51bd2d96ULL, 0x263943036497a514ULL, + 0x0d4b52170aa2edbaULL, 0x0c4758a1c7b4f758ULL, 0x178dadb1b502b51aULL, + 0x1ddbb20a602eb57aULL, 0x1fc2e2564a9f27fdULL, 0xd5f8c50a0e3d6f90ULL, + 0x0081da3bbe72ac09ULL, 0xcf140d002ccdb200ULL, 0x0ae8389f09b017feULL, + 0x17cc9ffdc03f4440ULL, 0x04eb921d704bcdddULL, 0x139a0ce4cdc521abULL, + 0x0bfce00c145cb0f0ULL, 0x99925ff132eff707ULL, 0x063f6e5da50c3d35ULL, + 0xa0c25dea3f0e6e29ULL, 0x0c7a9048cc8e040fULL, + }; + + const size_t padded = RoundUpTo(kCLMulNum, N); + auto expected_lower = AllocateAligned(padded); + auto expected_upper = AllocateAligned(padded); + CopyBytes(kCLMulLower, expected_lower.get()); + CopyBytes(kCLMulUpper, expected_upper.get()); + const size_t padding_size = (padded - kCLMulNum) * sizeof(T); + memset(expected_lower.get() + kCLMulNum, 0, padding_size); + memset(expected_upper.get() + kCLMulNum, 0, padding_size); + + // Random inputs in each lane + 
RandomState rng; + for (size_t rep = 0; rep < kCLMulNum / N; ++rep) { + for (size_t i = 0; i < N; ++i) { + in1[i] = Random64(&rng); + in2[i] = Random64(&rng); + } + + const auto a = Load(d, in1.get()); + const auto b = Load(d, in2.get()); +#if HWY_PRINT_CLMUL_GOLDEN + Store(CLMulLower(a, b), d, expected_lower.get() + rep * N); + Store(CLMulUpper(a, b), d, expected_upper.get() + rep * N); +#else + HWY_ASSERT_VEC_EQ(d, expected_lower.get() + rep * N, CLMulLower(a, b)); + HWY_ASSERT_VEC_EQ(d, expected_upper.get() + rep * N, CLMulUpper(a, b)); +#endif + } + +#if HWY_PRINT_CLMUL_GOLDEN + // RVV lacks PRIu64, so print 32-bit halves. + for (size_t i = 0; i < kCLMulNum; ++i) { + printf("0x%08x%08xULL,", static_cast(expected_lower[i] >> 32), + static_cast(expected_lower[i] & 0xFFFFFFFFU)); + } + printf("\n"); + for (size_t i = 0; i < kCLMulNum; ++i) { + printf("0x%08x%08xULL,", static_cast(expected_upper[i] >> 32), + static_cast(expected_upper[i] & 0xFFFFFFFFU)); + } +#endif // HWY_PRINT_CLMUL_GOLDEN +#else + (void)d; +#endif + } +}; + +HWY_NOINLINE void TestAllCLMul() { ForGEVectors<128, TestCLMul>()(uint64_t()); } + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE + +namespace hwy { +HWY_BEFORE_TEST(HwyCryptoTest); +HWY_EXPORT_AND_TEST_P(HwyCryptoTest, TestAllAES); +HWY_EXPORT_AND_TEST_P(HwyCryptoTest, TestAllCLMul); +} // namespace hwy + +#endif diff --git a/hwy/tests/demote_test.cc b/hwy/tests/demote_test.cc new file mode 100644 index 0000000..4339a54 --- /dev/null +++ b/hwy/tests/demote_test.cc @@ -0,0 +1,326 @@ +// Copyright 2019 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "tests/demote_test.cc" +#include "hwy/foreach_target.h" // IWYU pragma: keep +#include "hwy/highway.h" +#include "hwy/tests/test_util-inl.h" + +// Causes build timeout. +#if !HWY_IS_MSAN + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { + +template +bool IsFiniteT(T t) { + return std::isfinite(t); +} +// Wrapper avoids calling std::isfinite for integer types (ambiguous). 
+template +bool IsFiniteT(T /*unused*/) { + return true; +} + +template +struct TestDemoteTo { + template + HWY_NOINLINE void operator()(T /*unused*/, D from_d) { + static_assert(!IsFloat(), "Use TestDemoteToFloat for float output"); + static_assert(sizeof(T) > sizeof(ToT), "Input type must be wider"); + const Rebind to_d; + + const size_t N = Lanes(from_d); + auto from = AllocateAligned(N); + auto expected = AllocateAligned(N); + + // Narrower range in the wider type, for clamping before we cast + const T min = LimitsMin(); + const T max = LimitsMax(); + + const auto value_ok = [&](T& value) { + if (!IsFiniteT(value)) return false; + return true; + }; + + RandomState rng; + for (size_t rep = 0; rep < AdjustedReps(1000); ++rep) { + for (size_t i = 0; i < N; ++i) { + do { + const uint64_t bits = rng(); + CopyBytes(&bits, &from[i]); // not same size + } while (!value_ok(from[i])); + expected[i] = static_cast(HWY_MIN(HWY_MAX(min, from[i]), max)); + } + + const auto in = Load(from_d, from.get()); + HWY_ASSERT_VEC_EQ(to_d, expected.get(), DemoteTo(to_d, in)); + } + } +}; + +HWY_NOINLINE void TestAllDemoteToInt() { + ForDemoteVectors>()(int16_t()); + ForDemoteVectors, 2>()(int32_t()); + + ForDemoteVectors>()(int16_t()); + ForDemoteVectors, 2>()(int32_t()); + + const ForDemoteVectors> to_u16; + to_u16(int32_t()); + + const ForDemoteVectors> to_i16; + to_i16(int32_t()); +} + +HWY_NOINLINE void TestAllDemoteToMixed() { +#if HWY_HAVE_FLOAT64 + const ForDemoteVectors> to_i32; + to_i32(double()); +#endif +} + +template +struct TestDemoteToFloat { + template + HWY_NOINLINE void operator()(T /*unused*/, D from_d) { + // For floats, we clamp differently and cannot call LimitsMin. + static_assert(IsFloat(), "Use TestDemoteTo for integer output"); + static_assert(sizeof(T) > sizeof(ToT), "Input type must be wider"); + const Rebind to_d; + + const size_t N = Lanes(from_d); + auto from = AllocateAligned(N); + auto expected = AllocateAligned(N); + + RandomState rng; + for (size_t rep = 0; rep < AdjustedReps(1000); ++rep) { + for (size_t i = 0; i < N; ++i) { + do { + const uint64_t bits = rng(); + CopyBytes(&bits, &from[i]); // not same size + } while (!IsFiniteT(from[i])); + const T magn = std::abs(from[i]); + const T max_abs = HighestValue(); + // NOTE: std:: version from C++11 cmath is not defined in RVV GCC, see + // https://lists.freebsd.org/pipermail/freebsd-current/2014-January/048130.html + const T clipped = copysign(HWY_MIN(magn, max_abs), from[i]); + expected[i] = static_cast(clipped); + } + + HWY_ASSERT_VEC_EQ(to_d, expected.get(), + DemoteTo(to_d, Load(from_d, from.get()))); + } + } +}; + +HWY_NOINLINE void TestAllDemoteToFloat() { + // Must test f16 separately because we can only load/store/convert them. + +#if HWY_HAVE_FLOAT64 + const ForDemoteVectors, 1> to_float; + to_float(double()); +#endif +} + +template +AlignedFreeUniquePtr ReorderBF16TestCases(D d, size_t& padded) { + const float test_cases[] = { + // Same as BF16TestCases: + // +/- 1 + 1.0f, + -1.0f, + // +/- 0 + 0.0f, + -0.0f, + // near 0 + 0.25f, + -0.25f, + // +/- integer + 4.0f, + -32.0f, + // positive +/- delta + 2.015625f, + 3.984375f, + // negative +/- delta + -2.015625f, + -3.984375f, + + // No huge values - would interfere with sum. 
But add more to fill 2 * N: + -2.0f, + -10.0f, + 0.03125f, + 1.03125f, + 1.5f, + 2.0f, + 4.0f, + 5.0f, + 6.0f, + 8.0f, + 10.0f, + 256.0f, + 448.0f, + 2080.0f, + }; + const size_t kNumTestCases = sizeof(test_cases) / sizeof(test_cases[0]); + const size_t N = Lanes(d); + padded = RoundUpTo(kNumTestCases, 2 * N); // allow loading pairs of vectors + auto in = AllocateAligned(padded); + auto expected = AllocateAligned(padded); + std::copy(test_cases, test_cases + kNumTestCases, in.get()); + std::fill(in.get() + kNumTestCases, in.get() + padded, 0.0f); + return in; +} + +class TestReorderDemote2To { + // In-place N^2 selection sort to avoid dependencies + void Sort(float* p, size_t count) { + for (size_t i = 0; i < count - 1; ++i) { + // Find min_element + size_t idx_min = i; + for (size_t j = i + 1; j < count; j++) { + if (p[j] < p[idx_min]) { + idx_min = j; + } + } + + // Swap with current + const float tmp = p[i]; + p[i] = p[idx_min]; + p[idx_min] = tmp; + } + } + + public: + template + HWY_NOINLINE void operator()(TF32 /*t*/, DF32 d32) { +#if HWY_TARGET != HWY_SCALAR + size_t padded; + auto in = ReorderBF16TestCases(d32, padded); + + using TBF16 = bfloat16_t; + const Repartition dbf16; + const Half dbf16_half; + const size_t N = Lanes(d32); + auto temp16 = AllocateAligned(2 * N); + auto expected = AllocateAligned(2 * N); + auto actual = AllocateAligned(2 * N); + + for (size_t i = 0; i < padded; i += 2 * N) { + const auto f0 = Load(d32, &in[i + 0]); + const auto f1 = Load(d32, &in[i + N]); + const auto v16 = ReorderDemote2To(dbf16, f0, f1); + Store(v16, dbf16, temp16.get()); + const auto promoted0 = PromoteTo(d32, Load(dbf16_half, temp16.get() + 0)); + const auto promoted1 = PromoteTo(d32, Load(dbf16_half, temp16.get() + N)); + + // Smoke test: sum should be same (with tolerance for non-associativity) + const auto sum_expected = GetLane(SumOfLanes(d32, Add(f0, f1))); + const auto sum_actual = + GetLane(SumOfLanes(d32, Add(promoted0, promoted1))); + + HWY_ASSERT(sum_expected - 1E-4 <= sum_actual && + sum_actual <= sum_expected + 1E-4); + + // Ensure values are the same after sorting to undo the Reorder + Store(f0, d32, expected.get() + 0); + Store(f1, d32, expected.get() + N); + Store(promoted0, d32, actual.get() + 0); + Store(promoted1, d32, actual.get() + N); + Sort(expected.get(), 2 * N); + Sort(actual.get(), 2 * N); + HWY_ASSERT_VEC_EQ(d32, expected.get() + 0, Load(d32, actual.get() + 0)); + HWY_ASSERT_VEC_EQ(d32, expected.get() + N, Load(d32, actual.get() + N)); + } +#else // HWY_SCALAR + (void)d32; +#endif + } +}; + +HWY_NOINLINE void TestAllReorderDemote2To() { + ForShrinkableVectors()(float()); +} + +struct TestI32F64 { + template + HWY_NOINLINE void operator()(TF /*unused*/, const DF df) { + using TI = int32_t; + const Rebind di; + const size_t N = Lanes(df); + + // Integer positive + HWY_ASSERT_VEC_EQ(di, Iota(di, TI(4)), DemoteTo(di, Iota(df, TF(4.0)))); + + // Integer negative + HWY_ASSERT_VEC_EQ(di, Iota(di, -TI(N)), DemoteTo(di, Iota(df, -TF(N)))); + + // Above positive + HWY_ASSERT_VEC_EQ(di, Iota(di, TI(2)), DemoteTo(di, Iota(df, TF(2.001)))); + + // Below positive + HWY_ASSERT_VEC_EQ(di, Iota(di, TI(3)), DemoteTo(di, Iota(df, TF(3.9999)))); + + const TF eps = static_cast(0.0001); + // Above negative + HWY_ASSERT_VEC_EQ(di, Iota(di, -TI(N)), + DemoteTo(di, Iota(df, -TF(N + 1) + eps))); + + // Below negative + HWY_ASSERT_VEC_EQ(di, Iota(di, -TI(N + 1)), + DemoteTo(di, Iota(df, -TF(N + 1) - eps))); + + // Huge positive float + HWY_ASSERT_VEC_EQ(di, Set(di, LimitsMax()), + 
DemoteTo(di, Set(df, TF(1E12)))); + + // Huge negative float + HWY_ASSERT_VEC_EQ(di, Set(di, LimitsMin()), + DemoteTo(di, Set(df, TF(-1E12)))); + } +}; + +HWY_NOINLINE void TestAllI32F64() { +#if HWY_HAVE_FLOAT64 + ForDemoteVectors()(double()); +#endif +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +#endif // !HWY_IS_MSAN + +#if HWY_ONCE + +namespace hwy { +#if !HWY_IS_MSAN +HWY_BEFORE_TEST(HwyDemoteTest); +HWY_EXPORT_AND_TEST_P(HwyDemoteTest, TestAllDemoteToInt); +HWY_EXPORT_AND_TEST_P(HwyDemoteTest, TestAllDemoteToMixed); +HWY_EXPORT_AND_TEST_P(HwyDemoteTest, TestAllDemoteToFloat); +HWY_EXPORT_AND_TEST_P(HwyDemoteTest, TestAllReorderDemote2To); +HWY_EXPORT_AND_TEST_P(HwyDemoteTest, TestAllI32F64); +#endif // !HWY_IS_MSAN +} // namespace hwy + +#endif diff --git a/hwy/tests/float_test.cc b/hwy/tests/float_test.cc new file mode 100644 index 0000000..05d7b76 --- /dev/null +++ b/hwy/tests/float_test.cc @@ -0,0 +1,349 @@ +// Copyright 2019 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Tests some ops specific to floating-point types (Div, Round etc.) + +#include +#include + +#include +#include + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "tests/float_test.cc" +#include "hwy/foreach_target.h" // IWYU pragma: keep +#include "hwy/highway.h" +#include "hwy/tests/test_util-inl.h" + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { + +struct TestDiv { + template + HWY_NOINLINE void operator()(T /*unused*/, D d) { + const auto v = Iota(d, T(-2)); + const auto v1 = Set(d, T(1)); + + // Unchanged after division by 1. 
+ HWY_ASSERT_VEC_EQ(d, v, Div(v, v1)); + + const size_t N = Lanes(d); + auto expected = AllocateAligned(N); + for (size_t i = 0; i < N; ++i) { + expected[i] = (T(i) - 2) / T(2); + } + HWY_ASSERT_VEC_EQ(d, expected.get(), Div(v, Set(d, T(2)))); + } +}; + +HWY_NOINLINE void TestAllDiv() { ForFloatTypes(ForPartialVectors()); } + +struct TestApproximateReciprocal { + template + HWY_NOINLINE void operator()(T /*unused*/, D d) { + const auto v = Iota(d, T(-2)); + const auto nonzero = IfThenElse(Eq(v, Zero(d)), Set(d, T(1)), v); + const size_t N = Lanes(d); + auto input = AllocateAligned(N); + Store(nonzero, d, input.get()); + + auto actual = AllocateAligned(N); + Store(ApproximateReciprocal(nonzero), d, actual.get()); + + double max_l1 = 0.0; + double worst_expected = 0.0; + double worst_actual = 0.0; + for (size_t i = 0; i < N; ++i) { + const double expected = 1.0 / input[i]; + const double l1 = std::abs(expected - actual[i]); + if (l1 > max_l1) { + max_l1 = l1; + worst_expected = expected; + worst_actual = actual[i]; + } + } + const double abs_worst_expected = std::abs(worst_expected); + if (abs_worst_expected > 1E-5) { + const double max_rel = max_l1 / abs_worst_expected; + fprintf(stderr, "max l1 %f rel %f (%f vs %f)\n", max_l1, max_rel, + worst_expected, worst_actual); + HWY_ASSERT(max_rel < 0.004); + } + } +}; + +HWY_NOINLINE void TestAllApproximateReciprocal() { + ForPartialVectors()(float()); +} + +struct TestSquareRoot { + template + HWY_NOINLINE void operator()(T /*unused*/, D d) { + const auto vi = Iota(d, 0); + HWY_ASSERT_VEC_EQ(d, vi, Sqrt(Mul(vi, vi))); + } +}; + +HWY_NOINLINE void TestAllSquareRoot() { + ForFloatTypes(ForPartialVectors()); +} + +struct TestReciprocalSquareRoot { + template + HWY_NOINLINE void operator()(T /*unused*/, D d) { + const auto v = Set(d, 123.0f); + const size_t N = Lanes(d); + auto lanes = AllocateAligned(N); + Store(ApproximateReciprocalSqrt(v), d, lanes.get()); + for (size_t i = 0; i < N; ++i) { + float err = lanes[i] - 0.090166f; + if (err < 0.0f) err = -err; + if (err >= 4E-4f) { + HWY_ABORT("Lane %d (%d): actual %f err %f\n", static_cast(i), + static_cast(N), lanes[i], err); + } + } + } +}; + +HWY_NOINLINE void TestAllReciprocalSquareRoot() { + ForPartialVectors()(float()); +} + +template +AlignedFreeUniquePtr RoundTestCases(T /*unused*/, D d, size_t& padded) { + const T eps = std::numeric_limits::epsilon(); + const T test_cases[] = { + // +/- 1 + T(1), + T(-1), + // +/- 0 + T(0), + T(-0), + // near 0 + T(0.4), + T(-0.4), + // +/- integer + T(4), + T(-32), + // positive near limit + MantissaEnd() - T(1.5), + MantissaEnd() + T(1.5), + // negative near limit + -MantissaEnd() - T(1.5), + -MantissaEnd() + T(1.5), + // positive tiebreak + T(1.5), + T(2.5), + // negative tiebreak + T(-1.5), + T(-2.5), + // positive +/- delta + T(2.0001), + T(3.9999), + // negative +/- delta + T(-999.9999), + T(-998.0001), + // positive +/- epsilon + T(1) + eps, + T(1) - eps, + // negative +/- epsilon + T(-1) + eps, + T(-1) - eps, + // +/- huge (but still fits in float) + T(1E34), + T(-1E35), + // +/- infinity + std::numeric_limits::infinity(), + -std::numeric_limits::infinity(), + // qNaN + GetLane(NaN(d)) + }; + const size_t kNumTestCases = sizeof(test_cases) / sizeof(test_cases[0]); + const size_t N = Lanes(d); + padded = RoundUpTo(kNumTestCases, N); // allow loading whole vectors + auto in = AllocateAligned(padded); + auto expected = AllocateAligned(padded); + std::copy(test_cases, test_cases + kNumTestCases, in.get()); + std::fill(in.get() + kNumTestCases, in.get() + 
padded, T(0)); + return in; +} + +struct TestRound { + template + HWY_NOINLINE void operator()(T t, D d) { + size_t padded; + auto in = RoundTestCases(t, d, padded); + auto expected = AllocateAligned(padded); + + for (size_t i = 0; i < padded; ++i) { + // Avoid [std::]round, which does not round to nearest *even*. + // NOTE: std:: version from C++11 cmath is not defined in RVV GCC, see + // https://lists.freebsd.org/pipermail/freebsd-current/2014-January/048130.html + expected[i] = static_cast(nearbyint(in[i])); + } + for (size_t i = 0; i < padded; i += Lanes(d)) { + HWY_ASSERT_VEC_EQ(d, &expected[i], Round(Load(d, &in[i]))); + } + } +}; + +HWY_NOINLINE void TestAllRound() { + ForFloatTypes(ForPartialVectors()); +} + +struct TestNearestInt { + template + HWY_NOINLINE void operator()(TF tf, const DF df) { + using TI = MakeSigned; + const RebindToSigned di; + + size_t padded; + auto in = RoundTestCases(tf, df, padded); + auto expected = AllocateAligned(padded); + + constexpr double max = static_cast(LimitsMax()); + for (size_t i = 0; i < padded; ++i) { + if (std::isnan(in[i])) { + // We replace NaN with 0 below (no_nan) + expected[i] = 0; + } else if (std::isinf(in[i]) || double{std::abs(in[i])} >= max) { + // Avoid undefined result for lrintf + expected[i] = std::signbit(in[i]) ? LimitsMin() : LimitsMax(); + } else { + expected[i] = static_cast(lrintf(in[i])); + } + } + for (size_t i = 0; i < padded; i += Lanes(df)) { + const auto v = Load(df, &in[i]); + const auto no_nan = IfThenElse(Eq(v, v), v, Zero(df)); + HWY_ASSERT_VEC_EQ(di, &expected[i], NearestInt(no_nan)); + } + } +}; + +HWY_NOINLINE void TestAllNearestInt() { + ForPartialVectors()(float()); +} + +struct TestTrunc { + template + HWY_NOINLINE void operator()(T t, D d) { + size_t padded; + auto in = RoundTestCases(t, d, padded); + auto expected = AllocateAligned(padded); + + for (size_t i = 0; i < padded; ++i) { + // NOTE: std:: version from C++11 cmath is not defined in RVV GCC, see + // https://lists.freebsd.org/pipermail/freebsd-current/2014-January/048130.html + expected[i] = static_cast(trunc(in[i])); + } + for (size_t i = 0; i < padded; i += Lanes(d)) { + HWY_ASSERT_VEC_EQ(d, &expected[i], Trunc(Load(d, &in[i]))); + } + } +}; + +HWY_NOINLINE void TestAllTrunc() { + ForFloatTypes(ForPartialVectors()); +} + +struct TestCeil { + template + HWY_NOINLINE void operator()(T t, D d) { + size_t padded; + auto in = RoundTestCases(t, d, padded); + auto expected = AllocateAligned(padded); + + for (size_t i = 0; i < padded; ++i) { + expected[i] = std::ceil(in[i]); + } + for (size_t i = 0; i < padded; i += Lanes(d)) { + HWY_ASSERT_VEC_EQ(d, &expected[i], Ceil(Load(d, &in[i]))); + } + } +}; + +HWY_NOINLINE void TestAllCeil() { + ForFloatTypes(ForPartialVectors()); +} + +struct TestFloor { + template + HWY_NOINLINE void operator()(T t, D d) { + size_t padded; + auto in = RoundTestCases(t, d, padded); + auto expected = AllocateAligned(padded); + + for (size_t i = 0; i < padded; ++i) { + expected[i] = std::floor(in[i]); + } + for (size_t i = 0; i < padded; i += Lanes(d)) { + HWY_ASSERT_VEC_EQ(d, &expected[i], Floor(Load(d, &in[i]))); + } + } +}; + +HWY_NOINLINE void TestAllFloor() { + ForFloatTypes(ForPartialVectors()); +} + +struct TestAbsDiff { + template + HWY_NOINLINE void operator()(T /*unused*/, D d) { + const size_t N = Lanes(d); + auto in_lanes_a = AllocateAligned(N); + auto in_lanes_b = AllocateAligned(N); + auto out_lanes = AllocateAligned(N); + for (size_t i = 0; i < N; ++i) { + in_lanes_a[i] = static_cast((i ^ 1u) << i); + 
in_lanes_b[i] = static_cast(i << i); + out_lanes[i] = std::abs(in_lanes_a[i] - in_lanes_b[i]); + } + const auto a = Load(d, in_lanes_a.get()); + const auto b = Load(d, in_lanes_b.get()); + const auto expected = Load(d, out_lanes.get()); + HWY_ASSERT_VEC_EQ(d, expected, AbsDiff(a, b)); + HWY_ASSERT_VEC_EQ(d, expected, AbsDiff(b, a)); + } +}; + +HWY_NOINLINE void TestAllAbsDiff() { + ForPartialVectors()(float()); +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE + +namespace hwy { +HWY_BEFORE_TEST(HwyFloatTest); +HWY_EXPORT_AND_TEST_P(HwyFloatTest, TestAllDiv); +HWY_EXPORT_AND_TEST_P(HwyFloatTest, TestAllApproximateReciprocal); +HWY_EXPORT_AND_TEST_P(HwyFloatTest, TestAllSquareRoot); +HWY_EXPORT_AND_TEST_P(HwyFloatTest, TestAllReciprocalSquareRoot); +HWY_EXPORT_AND_TEST_P(HwyFloatTest, TestAllRound); +HWY_EXPORT_AND_TEST_P(HwyFloatTest, TestAllNearestInt); +HWY_EXPORT_AND_TEST_P(HwyFloatTest, TestAllTrunc); +HWY_EXPORT_AND_TEST_P(HwyFloatTest, TestAllCeil); +HWY_EXPORT_AND_TEST_P(HwyFloatTest, TestAllFloor); +HWY_EXPORT_AND_TEST_P(HwyFloatTest, TestAllAbsDiff); +} // namespace hwy + +#endif diff --git a/hwy/tests/hwy_gtest.h b/hwy/tests/hwy_gtest.h new file mode 100644 index 0000000..acecee8 --- /dev/null +++ b/hwy/tests/hwy_gtest.h @@ -0,0 +1,157 @@ +// Copyright 2021 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef HWY_TESTS_HWY_GTEST_H_ +#define HWY_TESTS_HWY_GTEST_H_ + +// Adapters for GUnit to run tests for all targets. + +#include +#include + +#include +#include // std::tuple + +#include "gtest/gtest.h" +#include "hwy/highway.h" + +namespace hwy { + +// googletest before 1.10 didn't define INSTANTIATE_TEST_SUITE_P() but instead +// used INSTANTIATE_TEST_CASE_P which is now deprecated. +#ifdef INSTANTIATE_TEST_SUITE_P +#define HWY_GTEST_INSTANTIATE_TEST_SUITE_P INSTANTIATE_TEST_SUITE_P +#else +#define HWY_GTEST_INSTANTIATE_TEST_SUITE_P INSTANTIATE_TEST_CASE_P +#endif + +// Helper class to run parametric tests using the hwy target as parameter. To +// use this define the following in your test: +// class MyTestSuite : public TestWithParamTarget { +// ... +// }; +// HWY_TARGET_INSTANTIATE_TEST_SUITE_P(MyTestSuite); +// TEST_P(MyTestSuite, MyTest) { ... } +class TestWithParamTarget : public testing::TestWithParam { + protected: + void SetUp() override { SetSupportedTargetsForTest(GetParam()); } + + void TearDown() override { + // Check that the parametric test calls SupportedTargets() when the source + // was compiled with more than one target. In the single-target case only + // static dispatch will be used anyway. 
+#if (HWY_TARGETS & (HWY_TARGETS - 1)) != 0 + EXPECT_TRUE(GetChosenTarget().IsInitialized()) + << "This hwy target parametric test doesn't use dynamic-dispatch and " + "doesn't need to be parametric."; +#endif + SetSupportedTargetsForTest(0); + } +}; + +// Function to convert the test parameter of a TestWithParamTarget for +// displaying it in the gtest test name. +static inline std::string TestParamTargetName( + const testing::TestParamInfo& info) { + return TargetName(info.param); +} + +#define HWY_TARGET_INSTANTIATE_TEST_SUITE_P(suite) \ + HWY_GTEST_INSTANTIATE_TEST_SUITE_P( \ + suite##Group, suite, \ + testing::ValuesIn(::hwy::SupportedAndGeneratedTargets()), \ + ::hwy::TestParamTargetName) + +// Helper class similar to TestWithParamTarget to run parametric tests that +// depend on the target and another parametric test. If you need to use multiple +// extra parameters use a std::tuple<> of them and ::testing::Generate(...) as +// the generator. To use this class define the following in your test: +// class MyTestSuite : public TestWithParamTargetT { +// ... +// }; +// HWY_TARGET_INSTANTIATE_TEST_SUITE_P_T(MyTestSuite, ::testing::Range(0, 9)); +// TEST_P(MyTestSuite, MyTest) { ... GetParam() .... } +template +class TestWithParamTargetAndT + : public ::testing::TestWithParam> { + public: + // Expose the parametric type here so it can be used by the + // HWY_TARGET_INSTANTIATE_TEST_SUITE_P_T macro. + using HwyParamType = T; + + protected: + void SetUp() override { + SetSupportedTargetsForTest(std::get<0>( + ::testing::TestWithParam>::GetParam())); + } + + void TearDown() override { + // Check that the parametric test calls SupportedTargets() when the source + // was compiled with more than one target. In the single-target case only + // static dispatch will be used anyway. +#if (HWY_TARGETS & (HWY_TARGETS - 1)) != 0 + EXPECT_TRUE(GetChosenTarget().IsInitialized()) + << "This hwy target parametric test doesn't use dynamic-dispatch and " + "doesn't need to be parametric."; +#endif + SetSupportedTargetsForTest(0); + } + + T GetParam() { + return std::get<1>( + ::testing::TestWithParam>::GetParam()); + } +}; + +template +std::string TestParamTargetNameAndT( + const testing::TestParamInfo>& info) { + return std::string(TargetName(std::get<0>(info.param))) + "_" + + ::testing::PrintToString(std::get<1>(info.param)); +} + +#define HWY_TARGET_INSTANTIATE_TEST_SUITE_P_T(suite, generator) \ + HWY_GTEST_INSTANTIATE_TEST_SUITE_P( \ + suite##Group, suite, \ + ::testing::Combine( \ + testing::ValuesIn(::hwy::SupportedAndGeneratedTargets()), \ + generator), \ + ::hwy::TestParamTargetNameAndT) + +// Helper macro to export a function and define a test that tests it. This is +// equivalent to do a HWY_EXPORT of a void(void) function and run it in a test: +// class MyTestSuite : public TestWithParamTarget { +// ... 
+// }; +// HWY_TARGET_INSTANTIATE_TEST_SUITE_P(MyTestSuite); +// HWY_EXPORT_AND_TEST_P(MyTestSuite, MyTest); +#define HWY_EXPORT_AND_TEST_P(suite, func_name) \ + HWY_EXPORT(func_name); \ + TEST_P(suite, func_name) { HWY_DYNAMIC_DISPATCH(func_name)(); } \ + static_assert(true, "For requiring trailing semicolon") + +#define HWY_EXPORT_AND_TEST_P_T(suite, func_name) \ + HWY_EXPORT(func_name); \ + TEST_P(suite, func_name) { HWY_DYNAMIC_DISPATCH(func_name)(GetParam()); } \ + static_assert(true, "For requiring trailing semicolon") + +#define HWY_BEFORE_TEST(suite) \ + class suite : public hwy::TestWithParamTarget {}; \ + HWY_TARGET_INSTANTIATE_TEST_SUITE_P(suite); \ + static_assert(true, "For requiring trailing semicolon") + +} // namespace hwy + +#endif // HWY_TESTS_HWY_GTEST_H_ diff --git a/hwy/tests/if_test.cc b/hwy/tests/if_test.cc new file mode 100644 index 0000000..e44a878 --- /dev/null +++ b/hwy/tests/if_test.cc @@ -0,0 +1,175 @@ +// Copyright 2019 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include + +#include "hwy/aligned_allocator.h" + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "tests/if_test.cc" +#include "hwy/foreach_target.h" // IWYU pragma: keep +#include "hwy/highway.h" +#include "hwy/tests/test_util-inl.h" + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { + +struct TestIfThenElse { + template + HWY_NOINLINE void operator()(T /*unused*/, D d) { + RandomState rng; + + using TI = MakeSigned; // For mask > 0 comparison + const Rebind di; + const size_t N = Lanes(d); + auto in1 = AllocateAligned(N); + auto in2 = AllocateAligned(N); + auto bool_lanes = AllocateAligned(N); + auto expected = AllocateAligned(N); + + // Each lane should have a chance of having mask=true. + for (size_t rep = 0; rep < AdjustedReps(200); ++rep) { + for (size_t i = 0; i < N; ++i) { + in1[i] = static_cast(Random32(&rng)); + in2[i] = static_cast(Random32(&rng)); + bool_lanes[i] = (Random32(&rng) & 16) ? TI(1) : TI(0); + } + + const auto v1 = Load(d, in1.get()); + const auto v2 = Load(d, in2.get()); + const auto mask = RebindMask(d, Gt(Load(di, bool_lanes.get()), Zero(di))); + + for (size_t i = 0; i < N; ++i) { + expected[i] = bool_lanes[i] ? in1[i] : in2[i]; + } + HWY_ASSERT_VEC_EQ(d, expected.get(), IfThenElse(mask, v1, v2)); + + for (size_t i = 0; i < N; ++i) { + expected[i] = bool_lanes[i] ? in1[i] : T(0); + } + HWY_ASSERT_VEC_EQ(d, expected.get(), IfThenElseZero(mask, v1)); + + for (size_t i = 0; i < N; ++i) { + expected[i] = bool_lanes[i] ? 
T(0) : in2[i]; + } + HWY_ASSERT_VEC_EQ(d, expected.get(), IfThenZeroElse(mask, v2)); + } + } +}; + +HWY_NOINLINE void TestAllIfThenElse() { + ForAllTypes(ForPartialVectors()); +} + +struct TestIfVecThenElse { + template + HWY_NOINLINE void operator()(T /*unused*/, D d) { + RandomState rng; + + using TU = MakeUnsigned; // For all-one mask + const Rebind du; + const size_t N = Lanes(d); + auto in1 = AllocateAligned(N); + auto in2 = AllocateAligned(N); + auto vec_lanes = AllocateAligned(N); + auto expected = AllocateAligned(N); + + // Each lane should have a chance of having mask=true. + for (size_t rep = 0; rep < AdjustedReps(200); ++rep) { + for (size_t i = 0; i < N; ++i) { + in1[i] = static_cast(Random32(&rng)); + in2[i] = static_cast(Random32(&rng)); + vec_lanes[i] = (Random32(&rng) & 16) ? static_cast(~TU(0)) : TU(0); + } + + const auto v1 = Load(d, in1.get()); + const auto v2 = Load(d, in2.get()); + const auto vec = BitCast(d, Load(du, vec_lanes.get())); + + for (size_t i = 0; i < N; ++i) { + expected[i] = vec_lanes[i] ? in1[i] : in2[i]; + } + HWY_ASSERT_VEC_EQ(d, expected.get(), IfVecThenElse(vec, v1, v2)); + } + } +}; + +HWY_NOINLINE void TestAllIfVecThenElse() { + ForAllTypes(ForPartialVectors()); +} + +struct TestZeroIfNegative { + template + HWY_NOINLINE void operator()(T /*unused*/, D d) { + const auto v0 = Zero(d); + const auto vp = Iota(d, 1); + const auto vn = Iota(d, T(-1E5)); // assumes N < 10^5 + + // Zero and positive remain unchanged + HWY_ASSERT_VEC_EQ(d, v0, ZeroIfNegative(v0)); + HWY_ASSERT_VEC_EQ(d, vp, ZeroIfNegative(vp)); + + // Negative are all replaced with zero + HWY_ASSERT_VEC_EQ(d, v0, ZeroIfNegative(vn)); + } +}; + +HWY_NOINLINE void TestAllZeroIfNegative() { + ForFloatTypes(ForPartialVectors()); +} + +struct TestIfNegative { + template + HWY_NOINLINE void operator()(T /*unused*/, D d) { + const auto v0 = Zero(d); + const auto vp = Iota(d, 1); + const auto vn = Or(vp, SignBit(d)); + + // Zero and positive remain unchanged + HWY_ASSERT_VEC_EQ(d, v0, IfNegativeThenElse(v0, vn, v0)); + HWY_ASSERT_VEC_EQ(d, vn, IfNegativeThenElse(v0, v0, vn)); + HWY_ASSERT_VEC_EQ(d, vp, IfNegativeThenElse(vp, vn, vp)); + HWY_ASSERT_VEC_EQ(d, vn, IfNegativeThenElse(vp, vp, vn)); + + // Negative are replaced with 2nd arg + HWY_ASSERT_VEC_EQ(d, v0, IfNegativeThenElse(vn, v0, vp)); + HWY_ASSERT_VEC_EQ(d, vn, IfNegativeThenElse(vn, vn, v0)); + HWY_ASSERT_VEC_EQ(d, vp, IfNegativeThenElse(vn, vp, vn)); + } +}; + +HWY_NOINLINE void TestAllIfNegative() { + ForFloatTypes(ForPartialVectors()); + ForSignedTypes(ForPartialVectors()); +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE + +namespace hwy { +HWY_BEFORE_TEST(HwyIfTest); +HWY_EXPORT_AND_TEST_P(HwyIfTest, TestAllIfThenElse); +HWY_EXPORT_AND_TEST_P(HwyIfTest, TestAllIfVecThenElse); +HWY_EXPORT_AND_TEST_P(HwyIfTest, TestAllZeroIfNegative); +HWY_EXPORT_AND_TEST_P(HwyIfTest, TestAllIfNegative); +} // namespace hwy + +#endif diff --git a/hwy/tests/interleaved_test.cc b/hwy/tests/interleaved_test.cc new file mode 100644 index 0000000..4d1fbd5 --- /dev/null +++ b/hwy/tests/interleaved_test.cc @@ -0,0 +1,256 @@ +// Copyright 2019 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "tests/interleaved_test.cc" +#include "hwy/foreach_target.h" // IWYU pragma: keep +#include "hwy/highway.h" +#include "hwy/tests/test_util-inl.h" + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { + +struct TestLoadStoreInterleaved2 { + template + HWY_NOINLINE void operator()(T /*unused*/, D d) { + const size_t N = Lanes(d); + + RandomState rng; + + // Data to be interleaved + auto bytes = AllocateAligned(2 * N); + for (size_t i = 0; i < 2 * N; ++i) { + bytes[i] = static_cast(Random32(&rng) & 0xFF); + } + const auto in0 = Load(d, &bytes[0 * N]); + const auto in1 = Load(d, &bytes[1 * N]); + + // Interleave here, ensure vector results match scalar + auto expected = AllocateAligned(3 * N); + auto actual_aligned = AllocateAligned(3 * N + 1); + T* actual = actual_aligned.get() + 1; + + for (size_t rep = 0; rep < 100; ++rep) { + for (size_t i = 0; i < N; ++i) { + expected[2 * i + 0] = bytes[0 * N + i]; + expected[2 * i + 1] = bytes[1 * N + i]; + // Ensure we do not write more than 2*N bytes + expected[2 * N + i] = actual[2 * N + i] = 0; + } + StoreInterleaved2(in0, in1, d, actual); + size_t pos = 0; + if (!BytesEqual(expected.get(), actual, 3 * N * sizeof(T), &pos)) { + Print(d, "in0", in0, pos / 4); + Print(d, "in1", in1, pos / 4); + const size_t i = pos; + fprintf(stderr, "interleaved i=%d %f %f %f %f %f %f %f %f\n", + static_cast(i), static_cast(actual[i]), + static_cast(actual[i + 1]), + static_cast(actual[i + 2]), + static_cast(actual[i + 3]), + static_cast(actual[i + 4]), + static_cast(actual[i + 5]), + static_cast(actual[i + 6]), + static_cast(actual[i + 7])); + HWY_ASSERT(false); + } + + Vec out0, out1; + LoadInterleaved2(d, actual, out0, out1); + HWY_ASSERT_VEC_EQ(d, in0, out0); + HWY_ASSERT_VEC_EQ(d, in1, out1); + } + } +}; + +HWY_NOINLINE void TestAllLoadStoreInterleaved2() { +#if HWY_TARGET == HWY_RVV + // Segments are limited to 8 registers, so we can only go up to LMUL=2. 
+ const ForExtendableVectors test; +#else + const ForPartialVectors test; +#endif + ForAllTypes(test); +} + +// Workaround for build timeout on GCC 12 aarch64, see #776 +#if HWY_COMPILER_GCC_ACTUAL >= 1200 && HWY_ARCH_ARM_A64 +#define HWY_BROKEN_LOAD34 1 +#else +#define HWY_BROKEN_LOAD34 0 +#endif + +#if !HWY_BROKEN_LOAD34 + +struct TestLoadStoreInterleaved3 { + template + HWY_NOINLINE void operator()(T /*unused*/, D d) { + const size_t N = Lanes(d); + + RandomState rng; + + // Data to be interleaved + auto bytes = AllocateAligned(3 * N); + for (size_t i = 0; i < 3 * N; ++i) { + bytes[i] = static_cast(Random32(&rng) & 0xFF); + } + const auto in0 = Load(d, &bytes[0 * N]); + const auto in1 = Load(d, &bytes[1 * N]); + const auto in2 = Load(d, &bytes[2 * N]); + + // Interleave here, ensure vector results match scalar + auto expected = AllocateAligned(4 * N); + auto actual_aligned = AllocateAligned(4 * N + 1); + T* actual = actual_aligned.get() + 1; + + for (size_t rep = 0; rep < 100; ++rep) { + for (size_t i = 0; i < N; ++i) { + expected[3 * i + 0] = bytes[0 * N + i]; + expected[3 * i + 1] = bytes[1 * N + i]; + expected[3 * i + 2] = bytes[2 * N + i]; + // Ensure we do not write more than 3*N bytes + expected[3 * N + i] = actual[3 * N + i] = 0; + } + StoreInterleaved3(in0, in1, in2, d, actual); + size_t pos = 0; + if (!BytesEqual(expected.get(), actual, 4 * N * sizeof(T), &pos)) { + Print(d, "in0", in0, pos / 3, N); + Print(d, "in1", in1, pos / 3, N); + Print(d, "in2", in2, pos / 3, N); + const size_t i = pos; + fprintf(stderr, "interleaved i=%d %f %f %f %f %f %f\n", + static_cast(i), static_cast(actual[i]), + static_cast(actual[i + 1]), + static_cast(actual[i + 2]), + static_cast(actual[i + 3]), + static_cast(actual[i + 4]), + static_cast(actual[i + 5])); + HWY_ASSERT(false); + } + + Vec out0, out1, out2; + LoadInterleaved3(d, actual, out0, out1, out2); + HWY_ASSERT_VEC_EQ(d, in0, out0); + HWY_ASSERT_VEC_EQ(d, in1, out1); + HWY_ASSERT_VEC_EQ(d, in2, out2); + } + } +}; + +HWY_NOINLINE void TestAllLoadStoreInterleaved3() { +#if HWY_TARGET == HWY_RVV + // Segments are limited to 8 registers, so we can only go up to LMUL=2. 
+  const ForExtendableVectors<TestLoadStoreInterleaved3, 2> test;
+#else
+  const ForPartialVectors<TestLoadStoreInterleaved3> test;
+#endif
+  ForAllTypes(test);
+}
+
+struct TestLoadStoreInterleaved4 {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const size_t N = Lanes(d);
+
+    RandomState rng;
+
+    // Data to be interleaved
+    auto bytes = AllocateAligned<T>(4 * N);
+
+    for (size_t i = 0; i < 4 * N; ++i) {
+      bytes[i] = static_cast<T>(Random32(&rng) & 0xFF);
+    }
+    const auto in0 = Load(d, &bytes[0 * N]);
+    const auto in1 = Load(d, &bytes[1 * N]);
+    const auto in2 = Load(d, &bytes[2 * N]);
+    const auto in3 = Load(d, &bytes[3 * N]);
+
+    // Interleave here, ensure vector results match scalar
+    auto expected = AllocateAligned<T>(5 * N);
+    auto actual_aligned = AllocateAligned<T>(5 * N + 1);
+    T* actual = actual_aligned.get() + 1;
+
+    for (size_t rep = 0; rep < 100; ++rep) {
+      for (size_t i = 0; i < N; ++i) {
+        expected[4 * i + 0] = bytes[0 * N + i];
+        expected[4 * i + 1] = bytes[1 * N + i];
+        expected[4 * i + 2] = bytes[2 * N + i];
+        expected[4 * i + 3] = bytes[3 * N + i];
+        // Ensure we do not write more than 4*N bytes
+        expected[4 * N + i] = actual[4 * N + i] = 0;
+      }
+      StoreInterleaved4(in0, in1, in2, in3, d, actual);
+      size_t pos = 0;
+      if (!BytesEqual(expected.get(), actual, 5 * N * sizeof(T), &pos)) {
+        Print(d, "in0", in0, pos / 4);
+        Print(d, "in1", in1, pos / 4);
+        Print(d, "in2", in2, pos / 4);
+        Print(d, "in3", in3, pos / 4);
+        const size_t i = pos;
+        fprintf(stderr, "interleaved i=%d %f %f %f %f %f %f %f %f\n",
+                static_cast<int>(i), static_cast<double>(actual[i]),
+                static_cast<double>(actual[i + 1]),
+                static_cast<double>(actual[i + 2]),
+                static_cast<double>(actual[i + 3]),
+                static_cast<double>(actual[i + 4]),
+                static_cast<double>(actual[i + 5]),
+                static_cast<double>(actual[i + 6]),
+                static_cast<double>(actual[i + 7]));
+        HWY_ASSERT(false);
+      }
+
+      Vec<D> out0, out1, out2, out3;
+      LoadInterleaved4(d, actual, out0, out1, out2, out3);
+      HWY_ASSERT_VEC_EQ(d, in0, out0);
+      HWY_ASSERT_VEC_EQ(d, in1, out1);
+      HWY_ASSERT_VEC_EQ(d, in2, out2);
+      HWY_ASSERT_VEC_EQ(d, in3, out3);
+    }
+  }
+};
+
+HWY_NOINLINE void TestAllLoadStoreInterleaved4() {
+#if HWY_TARGET == HWY_RVV
+  // Segments are limited to 8 registers, so we can only go up to LMUL=2.
+  const ForExtendableVectors<TestLoadStoreInterleaved4, 2> test;
+#else
+  const ForPartialVectors<TestLoadStoreInterleaved4> test;
+#endif
+  ForAllTypes(test);
+}
+
+#endif  // !HWY_BROKEN_LOAD34
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+
+namespace hwy {
+HWY_BEFORE_TEST(HwyInterleavedTest);
+HWY_EXPORT_AND_TEST_P(HwyInterleavedTest, TestAllLoadStoreInterleaved2);
+#if !HWY_BROKEN_LOAD34
+HWY_EXPORT_AND_TEST_P(HwyInterleavedTest, TestAllLoadStoreInterleaved3);
+HWY_EXPORT_AND_TEST_P(HwyInterleavedTest, TestAllLoadStoreInterleaved4);
+#endif
+}  // namespace hwy
+
+#endif
diff --git a/hwy/tests/list_targets.cc b/hwy/tests/list_targets.cc
new file mode 100644
index 0000000..d09ee4f
--- /dev/null
+++ b/hwy/tests/list_targets.cc
@@ -0,0 +1,71 @@
+// Copyright 2020 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Simple tool to print the list of targets that were compiled in when building
+// this tool.
+
+#include <stdio.h>
+
+#include "hwy/highway.h"
+
+void PrintTargets(const char* msg, int64_t targets) {
+  fprintf(stderr, "%s", msg);
+  // For each bit:
+  for (int64_t x = targets; x != 0; x = x & (x - 1)) {
+    // Extract value of least-significant bit.
+    fprintf(stderr, " %s", hwy::TargetName(x & (~x + 1)));
+  }
+  fprintf(stderr, "\n");
+}
+
+int main() {
+#ifdef HWY_COMPILE_ONLY_EMU128
+  const int only_emu128 = 1;
+#else
+  const int only_emu128 = 0;
+#endif
+#ifdef HWY_COMPILE_ONLY_SCALAR
+  const int only_scalar = 1;
+#else
+  const int only_scalar = 0;
+#endif
+#ifdef HWY_COMPILE_ONLY_STATIC
+  const int only_static = 1;
+#else
+  const int only_static = 0;
+#endif
+#ifdef HWY_COMPILE_ALL_ATTAINABLE
+  const int all_attain = 1;
+#else
+  const int all_attain = 0;
+#endif
+#ifdef HWY_IS_TEST
+  const int is_test = 1;
+#else
+  const int is_test = 0;
+#endif
+
+  fprintf(stderr,
+          "Config: emu128:%d scalar:%d static:%d all_attain:%d is_test:%d\n",
+          only_emu128, only_scalar, only_static, all_attain, is_test);
+  PrintTargets("Compiled HWY_TARGETS:  ", HWY_TARGETS);
+  PrintTargets("HWY_ATTAINABLE_TARGETS:", HWY_ATTAINABLE_TARGETS);
+  PrintTargets("HWY_BASELINE_TARGETS:  ", HWY_BASELINE_TARGETS);
+  PrintTargets("HWY_STATIC_TARGET:     ", HWY_STATIC_TARGET);
+  PrintTargets("HWY_BROKEN_TARGETS:    ", HWY_BROKEN_TARGETS);
+  PrintTargets("HWY_DISABLED_TARGETS:  ", HWY_DISABLED_TARGETS);
+  PrintTargets("Current CPU supports:  ", hwy::SupportedTargets());
+  return 0;
+}
diff --git a/hwy/tests/logical_test.cc b/hwy/tests/logical_test.cc
new file mode 100644
index 0000000..fa2b9b9
--- /dev/null
+++ b/hwy/tests/logical_test.cc
@@ -0,0 +1,270 @@
+// Copyright 2019 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
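+
+// Tests bitwise/logical ops (Not, And, Or, Xor, AndNot, Or3, OrAnd) plus
+// CopySign, BroadcastSignBit, TestBit and PopulationCount.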
+ +#include +#include +#include // memcmp + +#include "hwy/aligned_allocator.h" + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "tests/logical_test.cc" +#include "hwy/foreach_target.h" // IWYU pragma: keep +#include "hwy/highway.h" +#include "hwy/tests/test_util-inl.h" + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { + +struct TestLogicalInteger { + template + HWY_NOINLINE void operator()(T /*unused*/, D d) { + const auto v0 = Zero(d); + const auto vi = Iota(d, 0); + const auto ones = VecFromMask(d, Eq(v0, v0)); + const auto v1 = Set(d, 1); + const auto vnot1 = Set(d, T(~T(1))); + + HWY_ASSERT_VEC_EQ(d, v0, Not(ones)); + HWY_ASSERT_VEC_EQ(d, ones, Not(v0)); + HWY_ASSERT_VEC_EQ(d, v1, Not(vnot1)); + HWY_ASSERT_VEC_EQ(d, vnot1, Not(v1)); + + HWY_ASSERT_VEC_EQ(d, v0, And(v0, vi)); + HWY_ASSERT_VEC_EQ(d, v0, And(vi, v0)); + HWY_ASSERT_VEC_EQ(d, vi, And(vi, vi)); + + HWY_ASSERT_VEC_EQ(d, vi, Or(v0, vi)); + HWY_ASSERT_VEC_EQ(d, vi, Or(vi, v0)); + HWY_ASSERT_VEC_EQ(d, vi, Or(vi, vi)); + + HWY_ASSERT_VEC_EQ(d, vi, Xor(v0, vi)); + HWY_ASSERT_VEC_EQ(d, vi, Xor(vi, v0)); + HWY_ASSERT_VEC_EQ(d, v0, Xor(vi, vi)); + + HWY_ASSERT_VEC_EQ(d, vi, AndNot(v0, vi)); + HWY_ASSERT_VEC_EQ(d, v0, AndNot(vi, v0)); + HWY_ASSERT_VEC_EQ(d, v0, AndNot(vi, vi)); + + HWY_ASSERT_VEC_EQ(d, v0, Or3(v0, v0, v0)); + HWY_ASSERT_VEC_EQ(d, vi, Or3(v0, vi, v0)); + HWY_ASSERT_VEC_EQ(d, vi, Or3(v0, v0, vi)); + HWY_ASSERT_VEC_EQ(d, vi, Or3(v0, vi, vi)); + HWY_ASSERT_VEC_EQ(d, vi, Or3(vi, v0, v0)); + HWY_ASSERT_VEC_EQ(d, vi, Or3(vi, vi, v0)); + HWY_ASSERT_VEC_EQ(d, vi, Or3(vi, v0, vi)); + HWY_ASSERT_VEC_EQ(d, vi, Or3(vi, vi, vi)); + + HWY_ASSERT_VEC_EQ(d, v0, OrAnd(v0, v0, v0)); + HWY_ASSERT_VEC_EQ(d, v0, OrAnd(v0, vi, v0)); + HWY_ASSERT_VEC_EQ(d, v0, OrAnd(v0, v0, vi)); + HWY_ASSERT_VEC_EQ(d, vi, OrAnd(v0, vi, vi)); + HWY_ASSERT_VEC_EQ(d, vi, OrAnd(vi, v0, v0)); + HWY_ASSERT_VEC_EQ(d, vi, OrAnd(vi, vi, v0)); + HWY_ASSERT_VEC_EQ(d, vi, OrAnd(vi, v0, vi)); + HWY_ASSERT_VEC_EQ(d, vi, OrAnd(vi, vi, vi)); + + auto v = vi; + v = And(v, vi); + HWY_ASSERT_VEC_EQ(d, vi, v); + v = And(v, v0); + HWY_ASSERT_VEC_EQ(d, v0, v); + + v = Or(v, vi); + HWY_ASSERT_VEC_EQ(d, vi, v); + v = Or(v, v0); + HWY_ASSERT_VEC_EQ(d, vi, v); + + v = Xor(v, vi); + HWY_ASSERT_VEC_EQ(d, v0, v); + v = Xor(v, v0); + HWY_ASSERT_VEC_EQ(d, v0, v); + } +}; + +HWY_NOINLINE void TestAllLogicalInteger() { + ForIntegerTypes(ForPartialVectors()); +} + +struct TestLogicalFloat { + template + HWY_NOINLINE void operator()(T /*unused*/, D d) { + const auto v0 = Zero(d); + const auto vi = Iota(d, 0); + + HWY_ASSERT_VEC_EQ(d, v0, And(v0, vi)); + HWY_ASSERT_VEC_EQ(d, v0, And(vi, v0)); + HWY_ASSERT_VEC_EQ(d, vi, And(vi, vi)); + + HWY_ASSERT_VEC_EQ(d, vi, Or(v0, vi)); + HWY_ASSERT_VEC_EQ(d, vi, Or(vi, v0)); + HWY_ASSERT_VEC_EQ(d, vi, Or(vi, vi)); + + HWY_ASSERT_VEC_EQ(d, vi, Xor(v0, vi)); + HWY_ASSERT_VEC_EQ(d, vi, Xor(vi, v0)); + HWY_ASSERT_VEC_EQ(d, v0, Xor(vi, vi)); + + HWY_ASSERT_VEC_EQ(d, vi, AndNot(v0, vi)); + HWY_ASSERT_VEC_EQ(d, v0, AndNot(vi, v0)); + HWY_ASSERT_VEC_EQ(d, v0, AndNot(vi, vi)); + + auto v = vi; + v = And(v, vi); + HWY_ASSERT_VEC_EQ(d, vi, v); + v = And(v, v0); + HWY_ASSERT_VEC_EQ(d, v0, v); + + v = Or(v, vi); + HWY_ASSERT_VEC_EQ(d, vi, v); + v = Or(v, v0); + HWY_ASSERT_VEC_EQ(d, vi, v); + + v = Xor(v, vi); + HWY_ASSERT_VEC_EQ(d, v0, v); + v = Xor(v, v0); + HWY_ASSERT_VEC_EQ(d, v0, v); + } +}; + +HWY_NOINLINE void TestAllLogicalFloat() { + ForFloatTypes(ForPartialVectors()); +} + +struct TestCopySign { + template + HWY_NOINLINE void 
operator()(T /*unused*/, D d) { + const auto v0 = Zero(d); + const auto vp = Iota(d, 1); + const auto vn = Iota(d, T(-1E5)); // assumes N < 10^5 + + // Zero remains zero regardless of sign + HWY_ASSERT_VEC_EQ(d, v0, CopySign(v0, v0)); + HWY_ASSERT_VEC_EQ(d, v0, CopySign(v0, vp)); + HWY_ASSERT_VEC_EQ(d, v0, CopySign(v0, vn)); + HWY_ASSERT_VEC_EQ(d, v0, CopySignToAbs(v0, v0)); + HWY_ASSERT_VEC_EQ(d, v0, CopySignToAbs(v0, vp)); + HWY_ASSERT_VEC_EQ(d, v0, CopySignToAbs(v0, vn)); + + // Positive input, positive sign => unchanged + HWY_ASSERT_VEC_EQ(d, vp, CopySign(vp, vp)); + HWY_ASSERT_VEC_EQ(d, vp, CopySignToAbs(vp, vp)); + + // Positive input, negative sign => negated + HWY_ASSERT_VEC_EQ(d, Neg(vp), CopySign(vp, vn)); + HWY_ASSERT_VEC_EQ(d, Neg(vp), CopySignToAbs(vp, vn)); + + // Negative input, negative sign => unchanged + HWY_ASSERT_VEC_EQ(d, vn, CopySign(vn, vn)); + + // Negative input, positive sign => negated + HWY_ASSERT_VEC_EQ(d, Neg(vn), CopySign(vn, vp)); + } +}; + +HWY_NOINLINE void TestAllCopySign() { + ForFloatTypes(ForPartialVectors()); +} + +struct TestBroadcastSignBit { + template + HWY_NOINLINE void operator()(T /*unused*/, D d) { + const auto s0 = Zero(d); + const auto s1 = Set(d, -1); // all bit set + const auto vpos = And(Iota(d, 0), Set(d, LimitsMax())); + const auto vneg = Sub(s1, vpos); + + HWY_ASSERT_VEC_EQ(d, s0, BroadcastSignBit(vpos)); + HWY_ASSERT_VEC_EQ(d, s0, BroadcastSignBit(Set(d, LimitsMax()))); + + HWY_ASSERT_VEC_EQ(d, s1, BroadcastSignBit(vneg)); + HWY_ASSERT_VEC_EQ(d, s1, BroadcastSignBit(Set(d, LimitsMin()))); + HWY_ASSERT_VEC_EQ(d, s1, BroadcastSignBit(Set(d, LimitsMin() / 2))); + } +}; + +HWY_NOINLINE void TestAllBroadcastSignBit() { + ForSignedTypes(ForPartialVectors()); +} + +struct TestTestBit { + template + HWY_NOINLINE void operator()(T /*unused*/, D d) { + const size_t kNumBits = sizeof(T) * 8; + for (size_t i = 0; i < kNumBits; ++i) { + const auto bit1 = Set(d, T(1ull << i)); + const auto bit2 = Set(d, T(1ull << ((i + 1) % kNumBits))); + const auto bit3 = Set(d, T(1ull << ((i + 2) % kNumBits))); + const auto bits12 = Or(bit1, bit2); + const auto bits23 = Or(bit2, bit3); + HWY_ASSERT(AllTrue(d, TestBit(bit1, bit1))); + HWY_ASSERT(AllTrue(d, TestBit(bits12, bit1))); + HWY_ASSERT(AllTrue(d, TestBit(bits12, bit2))); + + HWY_ASSERT(AllFalse(d, TestBit(bits12, bit3))); + HWY_ASSERT(AllFalse(d, TestBit(bits23, bit1))); + HWY_ASSERT(AllFalse(d, TestBit(bit1, bit2))); + HWY_ASSERT(AllFalse(d, TestBit(bit2, bit1))); + HWY_ASSERT(AllFalse(d, TestBit(bit1, bit3))); + HWY_ASSERT(AllFalse(d, TestBit(bit3, bit1))); + HWY_ASSERT(AllFalse(d, TestBit(bit2, bit3))); + HWY_ASSERT(AllFalse(d, TestBit(bit3, bit2))); + } + } +}; + +HWY_NOINLINE void TestAllTestBit() { + ForIntegerTypes(ForPartialVectors()); +} + +struct TestPopulationCount { + template + HWY_NOINLINE void operator()(T /*unused*/, D d) { + RandomState rng; + size_t N = Lanes(d); + auto data = AllocateAligned(N); + auto popcnt = AllocateAligned(N); + for (size_t i = 0; i < AdjustedReps(1 << 18) / N; i++) { + for (size_t i = 0; i < N; i++) { + data[i] = static_cast(rng()); + popcnt[i] = static_cast(PopCount(data[i])); + } + HWY_ASSERT_VEC_EQ(d, popcnt.get(), PopulationCount(Load(d, data.get()))); + } + } +}; + +HWY_NOINLINE void TestAllPopulationCount() { + ForUnsignedTypes(ForPartialVectors()); +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE + +namespace hwy { +HWY_BEFORE_TEST(HwyLogicalTest); 
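+// Each test below is instantiated and run once per compiled-in target.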
+HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllLogicalInteger); +HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllLogicalFloat); +HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllCopySign); +HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllBroadcastSignBit); +HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllTestBit); +HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllPopulationCount); +} // namespace hwy + +#endif diff --git a/hwy/tests/mask_mem_test.cc b/hwy/tests/mask_mem_test.cc new file mode 100644 index 0000000..c44119d --- /dev/null +++ b/hwy/tests/mask_mem_test.cc @@ -0,0 +1,197 @@ +// Copyright 2019 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef __STDC_FORMAT_MACROS +#define __STDC_FORMAT_MACROS // before inttypes.h +#endif +#include +#include +#include +#include // memcmp + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "tests/mask_mem_test.cc" +#include "hwy/foreach_target.h" // IWYU pragma: keep +#include "hwy/highway.h" +#include "hwy/tests/test_util-inl.h" + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { + +struct TestMaskedLoad { + template + HWY_NOINLINE void operator()(T /*unused*/, D d) { + RandomState rng; + + using TI = MakeSigned; // For mask > 0 comparison + const Rebind di; + const size_t N = Lanes(d); + auto bool_lanes = AllocateAligned(N); + + auto lanes = AllocateAligned(N); + Store(Iota(d, T{1}), d, lanes.get()); + + // Each lane should have a chance of having mask=true. + for (size_t rep = 0; rep < AdjustedReps(200); ++rep) { + for (size_t i = 0; i < N; ++i) { + bool_lanes[i] = (Random32(&rng) & 1024) ? TI(1) : TI(0); + } + + const auto mask = RebindMask(d, Gt(Load(di, bool_lanes.get()), Zero(di))); + const auto expected = IfThenElseZero(mask, Load(d, lanes.get())); + const auto actual = MaskedLoad(mask, d, lanes.get()); + HWY_ASSERT_VEC_EQ(d, expected, actual); + } + } +}; + +HWY_NOINLINE void TestAllMaskedLoad() { + ForAllTypes(ForPartialVectors()); +} + +struct TestBlendedStore { + template + HWY_NOINLINE void operator()(T /*unused*/, D d) { + RandomState rng; + + using TI = MakeSigned; // For mask > 0 comparison + const Rebind di; + const size_t N = Lanes(d); + auto bool_lanes = AllocateAligned(N); + + const Vec v = Iota(d, T{1}); + auto actual = AllocateAligned(N); + auto expected = AllocateAligned(N); + + // Each lane should have a chance of having mask=true. + for (size_t rep = 0; rep < AdjustedReps(200); ++rep) { + for (size_t i = 0; i < N; ++i) { + bool_lanes[i] = (Random32(&rng) & 1024) ? TI(1) : TI(0); + // Re-initialize to something distinct from v[i]. + actual[i] = static_cast(127 - (i & 127)); + expected[i] = bool_lanes[i] ? 
static_cast(i + 1) : actual[i]; + } + + const auto mask = RebindMask(d, Gt(Load(di, bool_lanes.get()), Zero(di))); + BlendedStore(v, mask, d, actual.get()); + HWY_ASSERT_VEC_EQ(d, expected.get(), Load(d, actual.get())); + } + } +}; + +HWY_NOINLINE void TestAllBlendedStore() { + ForAllTypes(ForPartialVectors()); +} + +class TestStoreMaskBits { + public: + template + HWY_NOINLINE void operator()(T /*t*/, D /*d*/) { + RandomState rng; + using TI = MakeSigned; // For mask > 0 comparison + const Rebind di; + const size_t N = Lanes(di); + auto bool_lanes = AllocateAligned(N); + + const ScalableTag d_bits; + const size_t expected_num_bytes = (N + 7) / 8; + auto expected = AllocateAligned(expected_num_bytes); + auto actual = AllocateAligned(HWY_MAX(8, expected_num_bytes)); + + for (size_t rep = 0; rep < AdjustedReps(200); ++rep) { + // Generate random mask pattern. + for (size_t i = 0; i < N; ++i) { + bool_lanes[i] = static_cast((rng() & 1024) ? 1 : 0); + } + const auto bools = Load(di, bool_lanes.get()); + const auto mask = Gt(bools, Zero(di)); + + // Requires at least 8 bytes, ensured above. + const size_t bytes_written = StoreMaskBits(di, mask, actual.get()); + if (bytes_written != expected_num_bytes) { + fprintf(stderr, "%s expected %" PRIu64 " bytes, actual %" PRIu64 "\n", + TypeName(T(), N).c_str(), + static_cast(expected_num_bytes), + static_cast(bytes_written)); + + HWY_ASSERT(false); + } + + // Requires at least 8 bytes, ensured above. + const auto mask2 = LoadMaskBits(di, actual.get()); + HWY_ASSERT_MASK_EQ(di, mask, mask2); + + memset(expected.get(), 0, expected_num_bytes); + for (size_t i = 0; i < N; ++i) { + expected[i / 8] = + static_cast(expected[i / 8] | (bool_lanes[i] << (i % 8))); + } + + size_t i = 0; + // Stored bits must match original mask + for (; i < N; ++i) { + const TI is_set = (actual[i / 8] & (1 << (i % 8))) ? 
1 : 0; + if (is_set != bool_lanes[i]) { + fprintf(stderr, "%s lane %" PRIu64 ": expected %d, actual %d\n", + TypeName(T(), N).c_str(), static_cast(i), + static_cast(bool_lanes[i]), static_cast(is_set)); + Print(di, "bools", bools, 0, N); + Print(d_bits, "expected bytes", Load(d_bits, expected.get()), 0, + expected_num_bytes); + Print(d_bits, "actual bytes", Load(d_bits, actual.get()), 0, + expected_num_bytes); + + HWY_ASSERT(false); + } + } + // Any partial bits in the last byte must be zero + for (; i < 8 * bytes_written; ++i) { + const int bit = (actual[i / 8] & (1 << (i % 8))); + if (bit != 0) { + fprintf(stderr, "%s: bit #%" PRIu64 " should be zero\n", + TypeName(T(), N).c_str(), static_cast(i)); + Print(di, "bools", bools, 0, N); + Print(d_bits, "expected bytes", Load(d_bits, expected.get()), 0, + expected_num_bytes); + Print(d_bits, "actual bytes", Load(d_bits, actual.get()), 0, + expected_num_bytes); + + HWY_ASSERT(false); + } + } + } + } +}; + +HWY_NOINLINE void TestAllStoreMaskBits() { + ForAllTypes(ForPartialVectors()); +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE + +namespace hwy { +HWY_BEFORE_TEST(HwyMaskTest); +HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllMaskedLoad); +HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllBlendedStore); +HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllStoreMaskBits); +} // namespace hwy + +#endif diff --git a/hwy/tests/mask_test.cc b/hwy/tests/mask_test.cc new file mode 100644 index 0000000..f48b476 --- /dev/null +++ b/hwy/tests/mask_test.cc @@ -0,0 +1,293 @@ +// Copyright 2019 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include // memcmp + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "tests/mask_test.cc" +#include "hwy/foreach_target.h" // IWYU pragma: keep +#include "hwy/highway.h" +#include "hwy/tests/test_util-inl.h" + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { + +// All types. 
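+// Note: MaskFromVec requires each lane to be either all-zero or all-ones
+// bits; the memset patterns below produce exactly those two cases.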
+struct TestFromVec { + template + HWY_NOINLINE void operator()(T /*unused*/, D d) { + const size_t N = Lanes(d); + auto lanes = AllocateAligned(N); + + memset(lanes.get(), 0, N * sizeof(T)); + const auto actual_false = MaskFromVec(Load(d, lanes.get())); + HWY_ASSERT_MASK_EQ(d, MaskFalse(d), actual_false); + + memset(lanes.get(), 0xFF, N * sizeof(T)); + const auto actual_true = MaskFromVec(Load(d, lanes.get())); + HWY_ASSERT_MASK_EQ(d, MaskTrue(d), actual_true); + } +}; + +HWY_NOINLINE void TestAllFromVec() { + ForAllTypes(ForPartialVectors()); +} + +struct TestFirstN { + template + HWY_NOINLINE void operator()(T /*unused*/, D d) { + const size_t N = Lanes(d); + auto bool_lanes = AllocateAligned(N); + + using TN = SignedFromSize; + const size_t max_len = static_cast(LimitsMax()); + + const size_t max_lanes = HWY_MIN(2 * N, AdjustedReps(512)); + for (size_t len = 0; len <= HWY_MIN(max_lanes, max_len); ++len) { + // Loop instead of Iota+Lt to avoid wraparound for 8-bit T. + for (size_t i = 0; i < N; ++i) { + bool_lanes[i] = (i < len) ? T{1} : 0; + } + const auto expected = Eq(Load(d, bool_lanes.get()), Set(d, T{1})); + HWY_ASSERT_MASK_EQ(d, expected, FirstN(d, len)); + } + + // Also ensure huge values yield all-true (unless the vector is actually + // larger than max_len). + for (size_t i = 0; i < N; ++i) { + bool_lanes[i] = (i < max_len) ? T{1} : 0; + } + const auto expected = Eq(Load(d, bool_lanes.get()), Set(d, T{1})); + HWY_ASSERT_MASK_EQ(d, expected, FirstN(d, max_len)); + } +}; + +HWY_NOINLINE void TestAllFirstN() { + ForAllTypes(ForPartialVectors()); +} + +struct TestMaskVec { + template + HWY_NOINLINE void operator()(T /*unused*/, D d) { + RandomState rng; + + using TI = MakeSigned; // For mask > 0 comparison + const Rebind di; + const size_t N = Lanes(d); + auto bool_lanes = AllocateAligned(N); + + // Each lane should have a chance of having mask=true. + for (size_t rep = 0; rep < AdjustedReps(200); ++rep) { + for (size_t i = 0; i < N; ++i) { + bool_lanes[i] = (Random32(&rng) & 1024) ? TI(1) : TI(0); + } + + const auto mask = RebindMask(d, Gt(Load(di, bool_lanes.get()), Zero(di))); + HWY_ASSERT_MASK_EQ(d, mask, MaskFromVec(VecFromMask(d, mask))); + } + } +}; + +HWY_NOINLINE void TestAllMaskVec() { + const ForPartialVectors test; + + test(uint16_t()); + test(int16_t()); + // TODO(janwas): float16_t - cannot compare yet + + ForUIF3264(test); +} + +struct TestAllTrueFalse { + template + HWY_NOINLINE void operator()(T /*unused*/, D d) { + const auto zero = Zero(d); + auto v = zero; + + const size_t N = Lanes(d); + auto lanes = AllocateAligned(N); + std::fill(lanes.get(), lanes.get() + N, T(0)); + + HWY_ASSERT(AllTrue(d, Eq(v, zero))); + HWY_ASSERT(!AllFalse(d, Eq(v, zero))); + + // Single lane implies AllFalse = !AllTrue. Otherwise, there are multiple + // lanes and one is nonzero. 
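+    // Exactly one lane is set nonzero below, so AllFalse must return false
+    // whenever N > 1 (another lane still compares equal to zero).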
+ const bool expected_all_false = (N != 1); + + // Set each lane to nonzero and back to zero + for (size_t i = 0; i < N; ++i) { + lanes[i] = T(1); + v = Load(d, lanes.get()); + + HWY_ASSERT(!AllTrue(d, Eq(v, zero))); + + HWY_ASSERT(expected_all_false ^ AllFalse(d, Eq(v, zero))); + + lanes[i] = T(-1); + v = Load(d, lanes.get()); + HWY_ASSERT(!AllTrue(d, Eq(v, zero))); + HWY_ASSERT(expected_all_false ^ AllFalse(d, Eq(v, zero))); + + // Reset to all zero + lanes[i] = T(0); + v = Load(d, lanes.get()); + HWY_ASSERT(AllTrue(d, Eq(v, zero))); + HWY_ASSERT(!AllFalse(d, Eq(v, zero))); + } + } +}; + +HWY_NOINLINE void TestAllAllTrueFalse() { + ForAllTypes(ForPartialVectors()); +} + +struct TestCountTrue { + template + HWY_NOINLINE void operator()(T /*unused*/, D d) { + using TI = MakeSigned; // For mask > 0 comparison + const Rebind di; + const size_t N = Lanes(di); + auto bool_lanes = AllocateAligned(N); + memset(bool_lanes.get(), 0, N * sizeof(TI)); + + // For all combinations of zero/nonzero state of subset of lanes: + const size_t max_lanes = HWY_MIN(N, size_t(10)); + + for (size_t code = 0; code < (1ull << max_lanes); ++code) { + // Number of zeros written = number of mask lanes that are true. + size_t expected = 0; + for (size_t i = 0; i < max_lanes; ++i) { + const bool is_true = (code & (1ull << i)) != 0; + bool_lanes[i] = is_true ? TI(1) : TI(0); + expected += is_true; + } + + const auto mask = RebindMask(d, Gt(Load(di, bool_lanes.get()), Zero(di))); + const size_t actual = CountTrue(d, mask); + HWY_ASSERT_EQ(expected, actual); + } + } +}; + +HWY_NOINLINE void TestAllCountTrue() { + ForAllTypes(ForPartialVectors()); +} + +struct TestFindFirstTrue { // Also FindKnownFirstTrue + template + HWY_NOINLINE void operator()(T /*unused*/, D d) { + using TI = MakeSigned; // For mask > 0 comparison + const Rebind di; + const size_t N = Lanes(di); + auto bool_lanes = AllocateAligned(N); + memset(bool_lanes.get(), 0, N * sizeof(TI)); + + // For all combinations of zero/nonzero state of subset of lanes: + const size_t max_lanes = AdjustedLog2Reps(HWY_MIN(N, size_t(9))); + + HWY_ASSERT_EQ(intptr_t(-1), FindFirstTrue(d, MaskFalse(d))); + HWY_ASSERT_EQ(intptr_t(0), FindFirstTrue(d, MaskTrue(d))); + HWY_ASSERT_EQ(size_t(0), FindKnownFirstTrue(d, MaskTrue(d))); + + for (size_t code = 1; code < (1ull << max_lanes); ++code) { + for (size_t i = 0; i < max_lanes; ++i) { + bool_lanes[i] = (code & (1ull << i)) ? 
TI(1) : TI(0); + } + + const size_t expected = + Num0BitsBelowLS1Bit_Nonzero32(static_cast(code)); + const auto mask = RebindMask(d, Gt(Load(di, bool_lanes.get()), Zero(di))); + HWY_ASSERT_EQ(static_cast(expected), FindFirstTrue(d, mask)); + HWY_ASSERT_EQ(expected, FindKnownFirstTrue(d, mask)); + } + } +}; + +HWY_NOINLINE void TestAllFindFirstTrue() { + ForAllTypes(ForPartialVectors()); +} + +struct TestLogicalMask { + template + HWY_NOINLINE void operator()(T /*unused*/, D d) { + const auto m0 = MaskFalse(d); + const auto m_all = MaskTrue(d); + + using TI = MakeSigned; // For mask > 0 comparison + const Rebind di; + const size_t N = Lanes(di); + auto bool_lanes = AllocateAligned(N); + memset(bool_lanes.get(), 0, N * sizeof(TI)); + + HWY_ASSERT_MASK_EQ(d, m0, Not(m_all)); + HWY_ASSERT_MASK_EQ(d, m_all, Not(m0)); + + Print(d, ".", VecFromMask(d, ExclusiveNeither(m0, m0))); + HWY_ASSERT_MASK_EQ(d, m_all, ExclusiveNeither(m0, m0)); + HWY_ASSERT_MASK_EQ(d, m0, ExclusiveNeither(m_all, m0)); + HWY_ASSERT_MASK_EQ(d, m0, ExclusiveNeither(m0, m_all)); + + // For all combinations of zero/nonzero state of subset of lanes: + const size_t max_lanes = AdjustedLog2Reps(HWY_MIN(N, size_t(6))); + for (size_t code = 0; code < (1ull << max_lanes); ++code) { + for (size_t i = 0; i < max_lanes; ++i) { + bool_lanes[i] = (code & (1ull << i)) ? TI(1) : TI(0); + } + + const auto m = RebindMask(d, Gt(Load(di, bool_lanes.get()), Zero(di))); + + HWY_ASSERT_MASK_EQ(d, m0, Xor(m, m)); + HWY_ASSERT_MASK_EQ(d, m0, AndNot(m, m)); + HWY_ASSERT_MASK_EQ(d, m0, AndNot(m_all, m)); + + HWY_ASSERT_MASK_EQ(d, m, Or(m, m)); + HWY_ASSERT_MASK_EQ(d, m, Or(m0, m)); + HWY_ASSERT_MASK_EQ(d, m, Or(m, m0)); + HWY_ASSERT_MASK_EQ(d, m, Xor(m0, m)); + HWY_ASSERT_MASK_EQ(d, m, Xor(m, m0)); + HWY_ASSERT_MASK_EQ(d, m, And(m, m)); + HWY_ASSERT_MASK_EQ(d, m, And(m_all, m)); + HWY_ASSERT_MASK_EQ(d, m, And(m, m_all)); + HWY_ASSERT_MASK_EQ(d, m, AndNot(m0, m)); + } + } +}; + +HWY_NOINLINE void TestAllLogicalMask() { + ForAllTypes(ForPartialVectors()); +} +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE + +namespace hwy { +HWY_BEFORE_TEST(HwyMaskTest); +HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllFromVec); +HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllFirstN); +HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllMaskVec); +HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllAllTrueFalse); +HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllCountTrue); +HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllFindFirstTrue); +HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllLogicalMask); +} // namespace hwy + +#endif diff --git a/hwy/tests/memory_test.cc b/hwy/tests/memory_test.cc new file mode 100644 index 0000000..b78be2b --- /dev/null +++ b/hwy/tests/memory_test.cc @@ -0,0 +1,341 @@ +// Copyright 2019 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Ensure incompabilities with Windows macros (e.g. #define StoreFence) are +// detected. 
Must come before Highway headers. +#include "hwy/base.h" +#if defined(_WIN32) || defined(_WIN64) +#include +#endif + +#include +#include + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "tests/memory_test.cc" +#include "hwy/cache_control.h" +#include "hwy/foreach_target.h" // IWYU pragma: keep +#include "hwy/highway.h" +#include "hwy/tests/test_util-inl.h" + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { + +struct TestLoadStore { + template + HWY_NOINLINE void operator()(T /*unused*/, D d) { + const size_t N = Lanes(d); + const auto hi = Iota(d, static_cast(1 + N)); + const auto lo = Iota(d, 1); + auto lanes = AllocateAligned(2 * N); + Store(hi, d, &lanes[N]); + Store(lo, d, &lanes[0]); + + // Aligned load + const auto lo2 = Load(d, &lanes[0]); + HWY_ASSERT_VEC_EQ(d, lo2, lo); + + // Aligned store + auto lanes2 = AllocateAligned(2 * N); + Store(lo2, d, &lanes2[0]); + Store(hi, d, &lanes2[N]); + for (size_t i = 0; i < 2 * N; ++i) { + HWY_ASSERT_EQ(lanes[i], lanes2[i]); + } + + // Unaligned load + const auto vu = LoadU(d, &lanes[1]); + auto lanes3 = AllocateAligned(N); + Store(vu, d, lanes3.get()); + for (size_t i = 0; i < N; ++i) { + HWY_ASSERT_EQ(T(i + 2), lanes3[i]); + } + + // Unaligned store + StoreU(lo2, d, &lanes2[N / 2]); + size_t i = 0; + for (; i < N / 2; ++i) { + HWY_ASSERT_EQ(lanes[i], lanes2[i]); + } + for (; i < 3 * N / 2; ++i) { + HWY_ASSERT_EQ(T(i - N / 2 + 1), lanes2[i]); + } + // Subsequent values remain unchanged. + for (; i < 2 * N; ++i) { + HWY_ASSERT_EQ(T(i + 1), lanes2[i]); + } + } +}; + +HWY_NOINLINE void TestAllLoadStore() { + ForAllTypes(ForPartialVectors()); +} + +struct TestSafeCopyN { + template + HWY_NOINLINE void operator()(T /*unused*/, D d) { + const size_t N = Lanes(d); + const auto v = Iota(d, 1); + auto from = AllocateAligned(N + 2); + auto to = AllocateAligned(N + 2); + Store(v, d, from.get()); + + // 0: nothing changes + to[0] = T(); + SafeCopyN(0, d, from.get(), to.get()); + HWY_ASSERT_EQ(T(), to[0]); + + // 1: only first changes + to[1] = T(); + SafeCopyN(1, d, from.get(), to.get()); + HWY_ASSERT_EQ(static_cast(1), to[0]); + HWY_ASSERT_EQ(T(), to[1]); + + // N-1: last does not change + to[N - 1] = T(); + SafeCopyN(N - 1, d, from.get(), to.get()); + HWY_ASSERT_EQ(T(), to[N - 1]); + // Also check preceding lanes + to[N - 1] = static_cast(N); + HWY_ASSERT_VEC_EQ(d, to.get(), v); + + // N: all change + to[N] = T(); + SafeCopyN(N, d, from.get(), to.get()); + HWY_ASSERT_VEC_EQ(d, to.get(), v); + HWY_ASSERT_EQ(T(), to[N]); + + // N+1: subsequent lane does not change if using masked store + to[N + 1] = T(); + SafeCopyN(N + 1, d, from.get(), to.get()); + HWY_ASSERT_VEC_EQ(d, to.get(), v); +#if !HWY_MEM_OPS_MIGHT_FAULT + HWY_ASSERT_EQ(T(), to[N + 1]); +#endif + } +}; + +HWY_NOINLINE void TestAllSafeCopyN() { + ForAllTypes(ForPartialVectors()); +} + +struct TestLoadDup128 { + template + HWY_NOINLINE void operator()(T /*unused*/, D d) { + // Scalar does not define LoadDup128. 
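+    // LoadDup128 broadcasts one 16-byte block to every 128-bit block of the
+    // vector, so the expected lanes repeat with period 16 / sizeof(T).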
+#if HWY_TARGET != HWY_SCALAR || HWY_IDE + constexpr size_t N128 = 16 / sizeof(T); + alignas(16) T lanes[N128]; + for (size_t i = 0; i < N128; ++i) { + lanes[i] = static_cast(1 + i); + } + + const size_t N = Lanes(d); + auto expected = AllocateAligned(N); + for (size_t i = 0; i < N; ++i) { + expected[i] = static_cast(i % N128 + 1); + } + + HWY_ASSERT_VEC_EQ(d, expected.get(), LoadDup128(d, lanes)); +#else + (void)d; +#endif + } +}; + +HWY_NOINLINE void TestAllLoadDup128() { + ForAllTypes(ForGEVectors<128, TestLoadDup128>()); +} + +struct TestStream { + template + HWY_NOINLINE void operator()(T /*unused*/, D d) { + const auto v = Iota(d, T(1)); + const size_t affected_bytes = + (Lanes(d) * sizeof(T) + HWY_STREAM_MULTIPLE - 1) & + ~size_t(HWY_STREAM_MULTIPLE - 1); + const size_t affected_lanes = affected_bytes / sizeof(T); + auto out = AllocateAligned(2 * affected_lanes); + std::fill(out.get(), out.get() + 2 * affected_lanes, T(0)); + + Stream(v, d, out.get()); + FlushStream(); + const auto actual = Load(d, out.get()); + HWY_ASSERT_VEC_EQ(d, v, actual); + // Ensure Stream didn't modify more memory than expected + for (size_t i = affected_lanes; i < 2 * affected_lanes; ++i) { + HWY_ASSERT_EQ(T(0), out[i]); + } + } +}; + +HWY_NOINLINE void TestAllStream() { + const ForPartialVectors test; + // No u8,u16. + test(uint32_t()); + test(uint64_t()); + // No i8,i16. + test(int32_t()); + test(int64_t()); + ForFloatTypes(test); +} + +// Assumes little-endian byte order! +struct TestScatter { + template + HWY_NOINLINE void operator()(T /*unused*/, D d) { + using Offset = MakeSigned; + + const size_t N = Lanes(d); + const size_t range = 4 * N; // number of items to scatter + const size_t max_bytes = range * sizeof(T); // upper bound on offset + + RandomState rng; + + // Data to be scattered + auto bytes = AllocateAligned(max_bytes); + for (size_t i = 0; i < max_bytes; ++i) { + bytes[i] = static_cast(Random32(&rng) & 0xFF); + } + const auto data = Load(d, reinterpret_cast(bytes.get())); + + // Scatter into these regions, ensure vector results match scalar + auto expected = AllocateAligned(range); + auto actual = AllocateAligned(range); + + const Rebind d_offsets; + auto offsets = AllocateAligned(N); // or indices + + for (size_t rep = 0; rep < 100; ++rep) { + // Byte offsets + std::fill(expected.get(), expected.get() + range, T(0)); + std::fill(actual.get(), actual.get() + range, T(0)); + for (size_t i = 0; i < N; ++i) { + // Must be aligned + offsets[i] = static_cast((Random32(&rng) % range) * sizeof(T)); + CopyBytes( + bytes.get() + i * sizeof(T), + reinterpret_cast(expected.get()) + offsets[i]); + } + const auto voffsets = Load(d_offsets, offsets.get()); + ScatterOffset(data, d, actual.get(), voffsets); + if (!BytesEqual(expected.get(), actual.get(), max_bytes)) { + Print(d, "Data", data); + Print(d_offsets, "Offsets", voffsets); + HWY_ASSERT(false); + } + + // Indices + std::fill(expected.get(), expected.get() + range, T(0)); + std::fill(actual.get(), actual.get() + range, T(0)); + for (size_t i = 0; i < N; ++i) { + offsets[i] = static_cast(Random32(&rng) % range); + CopyBytes(bytes.get() + i * sizeof(T), + &expected[size_t(offsets[i])]); + } + const auto vindices = Load(d_offsets, offsets.get()); + ScatterIndex(data, d, actual.get(), vindices); + if (!BytesEqual(expected.get(), actual.get(), max_bytes)) { + Print(d, "Data", data); + Print(d_offsets, "Indices", vindices); + HWY_ASSERT(false); + } + } + } +}; + +HWY_NOINLINE void TestAllScatter() { + ForUIF3264(ForPartialVectors()); +} + +struct 
TestGather { + template + HWY_NOINLINE void operator()(T /*unused*/, D d) { + using Offset = MakeSigned; + + const size_t N = Lanes(d); + const size_t range = 4 * N; // number of items to gather + const size_t max_bytes = range * sizeof(T); // upper bound on offset + + RandomState rng; + + // Data to be gathered from + auto bytes = AllocateAligned(max_bytes); + for (size_t i = 0; i < max_bytes; ++i) { + bytes[i] = static_cast(Random32(&rng) & 0xFF); + } + + auto expected = AllocateAligned(N); + auto offsets = AllocateAligned(N); + auto indices = AllocateAligned(N); + + for (size_t rep = 0; rep < 100; ++rep) { + // Offsets + for (size_t i = 0; i < N; ++i) { + // Must be aligned + offsets[i] = static_cast((Random32(&rng) % range) * sizeof(T)); + CopyBytes(bytes.get() + offsets[i], &expected[i]); + } + + const Rebind d_offset; + const T* base = reinterpret_cast(bytes.get()); + auto actual = GatherOffset(d, base, Load(d_offset, offsets.get())); + HWY_ASSERT_VEC_EQ(d, expected.get(), actual); + + // Indices + for (size_t i = 0; i < N; ++i) { + indices[i] = + static_cast(Random32(&rng) % (max_bytes / sizeof(T))); + CopyBytes(base + indices[i], &expected[i]); + } + actual = GatherIndex(d, base, Load(d_offset, indices.get())); + HWY_ASSERT_VEC_EQ(d, expected.get(), actual); + } + } +}; + +HWY_NOINLINE void TestAllGather() { + ForUIF3264(ForPartialVectors()); +} + +HWY_NOINLINE void TestAllCache() { + LoadFence(); + FlushStream(); + int test = 0; + Prefetch(&test); + FlushCacheline(&test); + Pause(); +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE + +namespace hwy { +HWY_BEFORE_TEST(HwyMemoryTest); +HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllLoadStore); +HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllSafeCopyN); +HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllLoadDup128); +HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllStream); +HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllScatter); +HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllGather); +HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllCache); +} // namespace hwy + +#endif diff --git a/hwy/tests/mul_test.cc b/hwy/tests/mul_test.cc new file mode 100644 index 0000000..fab4292 --- /dev/null +++ b/hwy/tests/mul_test.cc @@ -0,0 +1,446 @@ +// Copyright 2019 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
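+
+// Tests Mul, MulHigh, MulFixedPoint15, MulEven/MulOdd, the fused
+// MulAdd/MulSub variants and ReorderWidenMulAccumulate.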
+ +#include +#include + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "tests/mul_test.cc" +#include "hwy/foreach_target.h" // IWYU pragma: keep +#include "hwy/highway.h" +#include "hwy/tests/test_util-inl.h" + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { + +template +constexpr uint64_t FirstBits() { + return (1ull << kBits) - 1; +} +template <> +constexpr uint64_t FirstBits<64>() { + return ~uint64_t{0}; +} + +struct TestUnsignedMul { + template + HWY_NOINLINE void operator()(T /*unused*/, D d) { + const auto v0 = Zero(d); + const auto v1 = Set(d, T(1)); + const auto vi = Iota(d, 1); + const auto vj = Iota(d, 3); + const size_t N = Lanes(d); + auto expected = AllocateAligned(N); + + HWY_ASSERT_VEC_EQ(d, v0, Mul(v0, v0)); + HWY_ASSERT_VEC_EQ(d, v1, Mul(v1, v1)); + HWY_ASSERT_VEC_EQ(d, vi, Mul(v1, vi)); + HWY_ASSERT_VEC_EQ(d, vi, Mul(vi, v1)); + + for (size_t i = 0; i < N; ++i) { + expected[i] = static_cast((1 + i) * (1 + i)); + } + HWY_ASSERT_VEC_EQ(d, expected.get(), Mul(vi, vi)); + + for (size_t i = 0; i < N; ++i) { + expected[i] = static_cast((1 + i) * (3 + i)); + } + HWY_ASSERT_VEC_EQ(d, expected.get(), Mul(vi, vj)); + + const T max = LimitsMax(); + const auto vmax = Set(d, max); + HWY_ASSERT_VEC_EQ(d, vmax, Mul(vmax, v1)); + HWY_ASSERT_VEC_EQ(d, vmax, Mul(v1, vmax)); + + constexpr uint64_t kMask = FirstBits(); + const T max2 = (static_cast(max) * max) & kMask; + HWY_ASSERT_VEC_EQ(d, Set(d, max2), Mul(vmax, vmax)); + } +}; + +struct TestSignedMul { + template + HWY_NOINLINE void operator()(T /*unused*/, D d) { + const size_t N = Lanes(d); + auto expected = AllocateAligned(N); + + const auto v0 = Zero(d); + const auto v1 = Set(d, T(1)); + const auto vi = Iota(d, 1); + const auto vn = Iota(d, -T(N)); // no i8 supported, so no wraparound + HWY_ASSERT_VEC_EQ(d, v0, Mul(v0, v0)); + HWY_ASSERT_VEC_EQ(d, v1, Mul(v1, v1)); + HWY_ASSERT_VEC_EQ(d, vi, Mul(v1, vi)); + HWY_ASSERT_VEC_EQ(d, vi, Mul(vi, v1)); + + for (size_t i = 0; i < N; ++i) { + expected[i] = static_cast((1 + i) * (1 + i)); + } + HWY_ASSERT_VEC_EQ(d, expected.get(), Mul(vi, vi)); + + for (size_t i = 0; i < N; ++i) { + expected[i] = static_cast((-T(N) + T(i)) * T(1u + i)); + } + HWY_ASSERT_VEC_EQ(d, expected.get(), Mul(vn, vi)); + HWY_ASSERT_VEC_EQ(d, expected.get(), Mul(vi, vn)); + } +}; + +HWY_NOINLINE void TestAllMul() { + const ForPartialVectors test_unsigned; + // No u8. + test_unsigned(uint16_t()); + test_unsigned(uint32_t()); + test_unsigned(uint64_t()); + + const ForPartialVectors test_signed; + // No i8. 
+ test_signed(int16_t()); + test_signed(int32_t()); + test_signed(int64_t()); +} + +struct TestMulHigh { + template + HWY_NOINLINE void operator()(T /*unused*/, D d) { + using Wide = MakeWide; + const size_t N = Lanes(d); + auto in_lanes = AllocateAligned(N); + auto expected_lanes = AllocateAligned(N); + + const auto vi = Iota(d, 1); + // no i8 supported, so no wraparound + const auto vni = Iota(d, T(static_cast(~N + 1))); + + const auto v0 = Zero(d); + HWY_ASSERT_VEC_EQ(d, v0, MulHigh(v0, v0)); + HWY_ASSERT_VEC_EQ(d, v0, MulHigh(v0, vi)); + HWY_ASSERT_VEC_EQ(d, v0, MulHigh(vi, v0)); + + // Large positive squared + for (size_t i = 0; i < N; ++i) { + in_lanes[i] = T(LimitsMax() >> i); + expected_lanes[i] = T((Wide(in_lanes[i]) * in_lanes[i]) >> 16); + } + auto v = Load(d, in_lanes.get()); + HWY_ASSERT_VEC_EQ(d, expected_lanes.get(), MulHigh(v, v)); + + // Large positive * small positive + for (size_t i = 0; i < N; ++i) { + expected_lanes[i] = T((Wide(in_lanes[i]) * T(1u + i)) >> 16); + } + HWY_ASSERT_VEC_EQ(d, expected_lanes.get(), MulHigh(v, vi)); + HWY_ASSERT_VEC_EQ(d, expected_lanes.get(), MulHigh(vi, v)); + + // Large positive * small negative + for (size_t i = 0; i < N; ++i) { + expected_lanes[i] = T((Wide(in_lanes[i]) * T(i - N)) >> 16); + } + HWY_ASSERT_VEC_EQ(d, expected_lanes.get(), MulHigh(v, vni)); + HWY_ASSERT_VEC_EQ(d, expected_lanes.get(), MulHigh(vni, v)); + } +}; + +HWY_NOINLINE void TestAllMulHigh() { + ForPartialVectors test; + test(int16_t()); + test(uint16_t()); +} + +struct TestMulFixedPoint15 { + template + HWY_NOINLINE void operator()(T /*unused*/, D d) { + const auto v0 = Zero(d); + HWY_ASSERT_VEC_EQ(d, v0, MulFixedPoint15(v0, v0)); + HWY_ASSERT_VEC_EQ(d, v0, MulFixedPoint15(v0, v0)); + + const size_t N = Lanes(d); + auto in1 = AllocateAligned(N); + auto in2 = AllocateAligned(N); + auto expected = AllocateAligned(N); + + // Random inputs in each lane + RandomState rng; + for (size_t rep = 0; rep < AdjustedReps(10000); ++rep) { + for (size_t i = 0; i < N; ++i) { + in1[i] = static_cast(Random64(&rng) & 0xFFFF); + in2[i] = static_cast(Random64(&rng) & 0xFFFF); + } + + for (size_t i = 0; i < N; ++i) { + // There are three ways to compute the results. x86 and ARM are defined + // using 32-bit multiplication results: + const int arm = (2 * in1[i] * in2[i] + 0x8000) >> 16; + const int x86 = (((in1[i] * in2[i]) >> 14) + 1) >> 1; + // On other platforms, split the result into upper and lower 16 bits. 
+ const auto v1 = Set(d, in1[i]); + const auto v2 = Set(d, in2[i]); + const int hi = GetLane(MulHigh(v1, v2)); + const int lo = GetLane(Mul(v1, v2)) & 0xFFFF; + const int split = 2 * hi + ((lo + 0x4000) >> 15); + expected[i] = static_cast(arm); + if (in1[i] != -32768 || in2[i] != -32768) { + HWY_ASSERT_EQ(arm, x86); + HWY_ASSERT_EQ(arm, split); + } + } + + const auto a = Load(d, in1.get()); + const auto b = Load(d, in2.get()); + HWY_ASSERT_VEC_EQ(d, expected.get(), MulFixedPoint15(a, b)); + } + } +}; + +HWY_NOINLINE void TestAllMulFixedPoint15() { + ForPartialVectors()(int16_t()); +} + +struct TestMulEven { + template + HWY_NOINLINE void operator()(T /*unused*/, D d) { + using Wide = MakeWide; + const Repartition d2; + const auto v0 = Zero(d); + HWY_ASSERT_VEC_EQ(d2, Zero(d2), MulEven(v0, v0)); + + const size_t N = Lanes(d); + auto in_lanes = AllocateAligned(N); + auto expected = AllocateAligned(Lanes(d2)); + for (size_t i = 0; i < N; i += 2) { + in_lanes[i + 0] = LimitsMax() >> i; + if (N != 1) { + in_lanes[i + 1] = 1; // unused + } + expected[i / 2] = Wide(in_lanes[i + 0]) * in_lanes[i + 0]; + } + + const auto v = Load(d, in_lanes.get()); + HWY_ASSERT_VEC_EQ(d2, expected.get(), MulEven(v, v)); + } +}; + +struct TestMulEvenOdd64 { + template + HWY_NOINLINE void operator()(T /*unused*/, D d) { +#if HWY_TARGET != HWY_SCALAR + const auto v0 = Zero(d); + HWY_ASSERT_VEC_EQ(d, Zero(d), MulEven(v0, v0)); + HWY_ASSERT_VEC_EQ(d, Zero(d), MulOdd(v0, v0)); + + const size_t N = Lanes(d); + if (N == 1) return; + + auto in1 = AllocateAligned(N); + auto in2 = AllocateAligned(N); + auto expected_even = AllocateAligned(N); + auto expected_odd = AllocateAligned(N); + + // Random inputs in each lane + RandomState rng; + for (size_t rep = 0; rep < AdjustedReps(1000); ++rep) { + for (size_t i = 0; i < N; ++i) { + in1[i] = Random64(&rng); + in2[i] = Random64(&rng); + } + + for (size_t i = 0; i < N; i += 2) { + expected_even[i] = Mul128(in1[i], in2[i], &expected_even[i + 1]); + expected_odd[i] = Mul128(in1[i + 1], in2[i + 1], &expected_odd[i + 1]); + } + + const auto a = Load(d, in1.get()); + const auto b = Load(d, in2.get()); + HWY_ASSERT_VEC_EQ(d, expected_even.get(), MulEven(a, b)); + HWY_ASSERT_VEC_EQ(d, expected_odd.get(), MulOdd(a, b)); + } +#else + (void)d; +#endif // HWY_TARGET != HWY_SCALAR + } +}; + +HWY_NOINLINE void TestAllMulEven() { + ForGEVectors<64, TestMulEven> test; + test(int32_t()); + test(uint32_t()); + + ForGEVectors<128, TestMulEvenOdd64>()(uint64_t()); +} + +#ifndef HWY_NATIVE_FMA +#error "Bug in set_macros-inl.h, did not set HWY_NATIVE_FMA" +#endif + +struct TestMulAdd { + template + HWY_NOINLINE void operator()(T /*unused*/, D d) { + const auto k0 = Zero(d); + const auto kNeg0 = Set(d, T(-0.0)); + const auto v1 = Iota(d, 1); + const auto v2 = Iota(d, 2); + const size_t N = Lanes(d); + auto expected = AllocateAligned(N); + HWY_ASSERT_VEC_EQ(d, k0, MulAdd(k0, k0, k0)); + HWY_ASSERT_VEC_EQ(d, v2, MulAdd(k0, v1, v2)); + HWY_ASSERT_VEC_EQ(d, v2, MulAdd(v1, k0, v2)); + HWY_ASSERT_VEC_EQ(d, k0, NegMulAdd(k0, k0, k0)); + HWY_ASSERT_VEC_EQ(d, v2, NegMulAdd(k0, v1, v2)); + HWY_ASSERT_VEC_EQ(d, v2, NegMulAdd(v1, k0, v2)); + + for (size_t i = 0; i < N; ++i) { + expected[i] = static_cast((i + 1) * (i + 2)); + } + HWY_ASSERT_VEC_EQ(d, expected.get(), MulAdd(v2, v1, k0)); + HWY_ASSERT_VEC_EQ(d, expected.get(), MulAdd(v1, v2, k0)); + HWY_ASSERT_VEC_EQ(d, expected.get(), NegMulAdd(Neg(v2), v1, k0)); + HWY_ASSERT_VEC_EQ(d, expected.get(), NegMulAdd(v1, Neg(v2), k0)); + + for (size_t i = 0; i < N; ++i) { 
+ expected[i] = static_cast((i + 2) * (i + 2) + (i + 1)); + } + HWY_ASSERT_VEC_EQ(d, expected.get(), MulAdd(v2, v2, v1)); + HWY_ASSERT_VEC_EQ(d, expected.get(), NegMulAdd(Neg(v2), v2, v1)); + + for (size_t i = 0; i < N; ++i) { + expected[i] = + T(-T(i + 2u) * static_cast(i + 2) + static_cast(1 + i)); + } + HWY_ASSERT_VEC_EQ(d, expected.get(), NegMulAdd(v2, v2, v1)); + + HWY_ASSERT_VEC_EQ(d, k0, MulSub(k0, k0, k0)); + HWY_ASSERT_VEC_EQ(d, kNeg0, NegMulSub(k0, k0, k0)); + + for (size_t i = 0; i < N; ++i) { + expected[i] = -T(i + 2); + } + HWY_ASSERT_VEC_EQ(d, expected.get(), MulSub(k0, v1, v2)); + HWY_ASSERT_VEC_EQ(d, expected.get(), MulSub(v1, k0, v2)); + HWY_ASSERT_VEC_EQ(d, expected.get(), NegMulSub(Neg(k0), v1, v2)); + HWY_ASSERT_VEC_EQ(d, expected.get(), NegMulSub(v1, Neg(k0), v2)); + + for (size_t i = 0; i < N; ++i) { + expected[i] = static_cast((i + 1) * (i + 2)); + } + HWY_ASSERT_VEC_EQ(d, expected.get(), MulSub(v1, v2, k0)); + HWY_ASSERT_VEC_EQ(d, expected.get(), MulSub(v2, v1, k0)); + HWY_ASSERT_VEC_EQ(d, expected.get(), NegMulSub(Neg(v1), v2, k0)); + HWY_ASSERT_VEC_EQ(d, expected.get(), NegMulSub(v2, Neg(v1), k0)); + + for (size_t i = 0; i < N; ++i) { + expected[i] = static_cast((i + 2) * (i + 2) - (1 + i)); + } + HWY_ASSERT_VEC_EQ(d, expected.get(), MulSub(v2, v2, v1)); + HWY_ASSERT_VEC_EQ(d, expected.get(), NegMulSub(Neg(v2), v2, v1)); + } +}; + +HWY_NOINLINE void TestAllMulAdd() { + ForFloatTypes(ForPartialVectors()); +} + +struct TestReorderWidenMulAccumulate { + template + HWY_NOINLINE void operator()(TN /*unused*/, DN dn) { + using TW = MakeWide; + const RepartitionToWide dw; + const Half dnh; + using VW = Vec; + using VN = Vec; + const size_t NN = Lanes(dn); + + const VW f0 = Zero(dw); + const VW f1 = Set(dw, TW{1}); + const VN bf0 = Zero(dn); + // Cannot Set() bfloat16_t directly. + const VN bf1 = ReorderDemote2To(dn, f1, f1); + + // Any input zero => both outputs zero + VW sum1 = f0; + HWY_ASSERT_VEC_EQ(dw, f0, + ReorderWidenMulAccumulate(dw, bf0, bf0, f0, sum1)); + HWY_ASSERT_VEC_EQ(dw, f0, sum1); + HWY_ASSERT_VEC_EQ(dw, f0, + ReorderWidenMulAccumulate(dw, bf0, bf1, f0, sum1)); + HWY_ASSERT_VEC_EQ(dw, f0, sum1); + HWY_ASSERT_VEC_EQ(dw, f0, + ReorderWidenMulAccumulate(dw, bf1, bf0, f0, sum1)); + HWY_ASSERT_VEC_EQ(dw, f0, sum1); + + // delta[p] := 1, all others zero. For each p: Dot(delta, all-ones) == 1. 
+ auto delta_w = AllocateAligned(NN); + for (size_t i = 0; i < NN; ++i) { + delta_w[i] = TW{0}; + } + for (size_t p = 0; p < NN; ++p) { + delta_w[p] = TW{1}; + const VW delta0 = Load(dw, delta_w.get()); + const VW delta1 = Load(dw, delta_w.get() + NN / 2); + delta_w[p] = TW{0}; + const VN delta = ReorderDemote2To(dn, delta0, delta1); + + { + sum1 = f0; + const VW sum0 = ReorderWidenMulAccumulate(dw, delta, bf1, f0, sum1); + HWY_ASSERT_EQ(TW{1}, GetLane(SumOfLanes(dw, Add(sum0, sum1)))); + } + // Swapped arg order + { + sum1 = f0; + const VW sum0 = ReorderWidenMulAccumulate(dw, bf1, delta, f0, sum1); + HWY_ASSERT_EQ(TW{1}, GetLane(SumOfLanes(dw, Add(sum0, sum1)))); + } + // Start with nonzero sum0 or sum1 + { + VW sum0 = PromoteTo(dw, LowerHalf(dnh, delta)); + sum1 = PromoteTo(dw, UpperHalf(dnh, delta)); + sum0 = ReorderWidenMulAccumulate(dw, delta, bf1, sum0, sum1); + HWY_ASSERT_EQ(TW{2}, GetLane(SumOfLanes(dw, Add(sum0, sum1)))); + } + // Start with nonzero sum0 or sum1, and swap arg order + { + VW sum0 = PromoteTo(dw, LowerHalf(dnh, delta)); + sum1 = PromoteTo(dw, UpperHalf(dnh, delta)); + sum0 = ReorderWidenMulAccumulate(dw, bf1, delta, sum0, sum1); + HWY_ASSERT_EQ(TW{2}, GetLane(SumOfLanes(dw, Add(sum0, sum1)))); + } + } + } +}; + +HWY_NOINLINE void TestAllReorderWidenMulAccumulate() { + ForShrinkableVectors()(bfloat16_t()); + ForShrinkableVectors()(int16_t()); +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE + +namespace hwy { +HWY_BEFORE_TEST(HwyMulTest); +HWY_EXPORT_AND_TEST_P(HwyMulTest, TestAllMul); +HWY_EXPORT_AND_TEST_P(HwyMulTest, TestAllMulHigh); +HWY_EXPORT_AND_TEST_P(HwyMulTest, TestAllMulFixedPoint15); +HWY_EXPORT_AND_TEST_P(HwyMulTest, TestAllMulEven); +HWY_EXPORT_AND_TEST_P(HwyMulTest, TestAllMulAdd); +HWY_EXPORT_AND_TEST_P(HwyMulTest, TestAllReorderWidenMulAccumulate); +} // namespace hwy + +#endif diff --git a/hwy/tests/reduction_test.cc b/hwy/tests/reduction_test.cc new file mode 100644 index 0000000..5e39abc --- /dev/null +++ b/hwy/tests/reduction_test.cc @@ -0,0 +1,227 @@ +// Copyright 2019 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "tests/reduction_test.cc" +#include "hwy/foreach_target.h" // IWYU pragma: keep +#include "hwy/highway.h" +#include "hwy/tests/test_util-inl.h" + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { + +struct TestSumOfLanes { + template + HWY_NOINLINE void operator()(T /*unused*/, D d) { + const size_t N = Lanes(d); + auto in_lanes = AllocateAligned(N); + + // Lane i = bit i, higher lanes 0 + double sum = 0.0; + // Avoid setting sign bit and cap at double precision + constexpr size_t kBits = HWY_MIN(sizeof(T) * 8 - 1, 51); + for (size_t i = 0; i < N; ++i) { + in_lanes[i] = i < kBits ? 
static_cast(1ull << i) : 0; + sum += static_cast(in_lanes[i]); + } + HWY_ASSERT_VEC_EQ(d, Set(d, T(sum)), + SumOfLanes(d, Load(d, in_lanes.get()))); + + // Lane i = i (iota) to include upper lanes + sum = 0.0; + for (size_t i = 0; i < N; ++i) { + sum += static_cast(i); + } + HWY_ASSERT_VEC_EQ(d, Set(d, T(sum)), SumOfLanes(d, Iota(d, 0))); + } +}; + +HWY_NOINLINE void TestAllSumOfLanes() { + ForUIF3264(ForPartialVectors()); + ForUI16(ForPartialVectors()); +} + +struct TestMinOfLanes { + template + HWY_NOINLINE void operator()(T /*unused*/, D d) { + const size_t N = Lanes(d); + auto in_lanes = AllocateAligned(N); + + // Lane i = bit i, higher lanes = 2 (not the minimum) + T min = HighestValue(); + // Avoid setting sign bit and cap at double precision + constexpr size_t kBits = HWY_MIN(sizeof(T) * 8 - 1, 51); + for (size_t i = 0; i < N; ++i) { + in_lanes[i] = i < kBits ? static_cast(1ull << i) : 2; + min = HWY_MIN(min, in_lanes[i]); + } + HWY_ASSERT_VEC_EQ(d, Set(d, min), MinOfLanes(d, Load(d, in_lanes.get()))); + + // Lane i = N - i to include upper lanes + min = HighestValue(); + for (size_t i = 0; i < N; ++i) { + in_lanes[i] = static_cast(N - i); // no 8-bit T so no wraparound + min = HWY_MIN(min, in_lanes[i]); + } + HWY_ASSERT_VEC_EQ(d, Set(d, min), MinOfLanes(d, Load(d, in_lanes.get()))); + + // Bug #910: also check negative values + min = HighestValue(); + const T input_copy[] = {static_cast(-1), + static_cast(-2), + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14}; + size_t i = 0; + for (; i < HWY_MIN(N, sizeof(input_copy) / sizeof(T)); ++i) { + in_lanes[i] = input_copy[i]; + min = HWY_MIN(min, input_copy[i]); + } + // Pad with neutral element to full vector (so we can load) + for (; i < N; ++i) { + in_lanes[i] = min; + } + HWY_ASSERT_VEC_EQ(d, Set(d, min), MinOfLanes(d, Load(d, in_lanes.get()))); + } +}; + +struct TestMaxOfLanes { + template + HWY_NOINLINE void operator()(T /*unused*/, D d) { + const size_t N = Lanes(d); + auto in_lanes = AllocateAligned(N); + + T max = LowestValue(); + // Avoid setting sign bit and cap at double precision + constexpr size_t kBits = HWY_MIN(sizeof(T) * 8 - 1, 51); + for (size_t i = 0; i < N; ++i) { + in_lanes[i] = i < kBits ? 
static_cast(1ull << i) : 0; + max = HWY_MAX(max, in_lanes[i]); + } + HWY_ASSERT_VEC_EQ(d, Set(d, max), MaxOfLanes(d, Load(d, in_lanes.get()))); + + // Lane i = i to include upper lanes + max = LowestValue(); + for (size_t i = 0; i < N; ++i) { + in_lanes[i] = static_cast(i); // no 8-bit T so no wraparound + max = HWY_MAX(max, in_lanes[i]); + } + HWY_ASSERT_VEC_EQ(d, Set(d, max), MaxOfLanes(d, Load(d, in_lanes.get()))); + + // Bug #910: also check negative values + max = LowestValue(); + const T input_copy[] = {static_cast(-1), + static_cast(-2), + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14}; + size_t i = 0; + for (; i < HWY_MIN(N, sizeof(input_copy) / sizeof(T)); ++i) { + in_lanes[i] = input_copy[i]; + max = HWY_MAX(max, in_lanes[i]); + } + // Pad with neutral element to full vector (so we can load) + for (; i < N; ++i) { + in_lanes[i] = max; + } + HWY_ASSERT_VEC_EQ(d, Set(d, max), MaxOfLanes(d, Load(d, in_lanes.get()))); + } +}; + +HWY_NOINLINE void TestAllMinMaxOfLanes() { + const ForPartialVectors test_min; + const ForPartialVectors test_max; + ForUIF3264(test_min); + ForUIF3264(test_max); + ForUI16(test_min); + ForUI16(test_max); +} + +struct TestSumsOf8 { + template + HWY_NOINLINE void operator()(T /*unused*/, D d) { + RandomState rng; + + const size_t N = Lanes(d); + if (N < 8) return; + const Repartition du64; + + auto in_lanes = AllocateAligned(N); + auto sum_lanes = AllocateAligned(N / 8); + + for (size_t rep = 0; rep < 100; ++rep) { + for (size_t i = 0; i < N; ++i) { + in_lanes[i] = Random64(&rng) & 0xFF; + } + + for (size_t idx_sum = 0; idx_sum < N / 8; ++idx_sum) { + uint64_t sum = 0; + for (size_t i = 0; i < 8; ++i) { + sum += in_lanes[idx_sum * 8 + i]; + } + sum_lanes[idx_sum] = sum; + } + + const Vec in = Load(d, in_lanes.get()); + HWY_ASSERT_VEC_EQ(du64, sum_lanes.get(), SumsOf8(in)); + } + } +}; + +HWY_NOINLINE void TestAllSumsOf8() { + ForGEVectors<64, TestSumsOf8>()(uint8_t()); +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE + +namespace hwy { +HWY_BEFORE_TEST(HwyReductionTest); +HWY_EXPORT_AND_TEST_P(HwyReductionTest, TestAllSumOfLanes); +HWY_EXPORT_AND_TEST_P(HwyReductionTest, TestAllMinMaxOfLanes); +HWY_EXPORT_AND_TEST_P(HwyReductionTest, TestAllSumsOf8); +} // namespace hwy + +#endif diff --git a/hwy/tests/reverse_test.cc b/hwy/tests/reverse_test.cc new file mode 100644 index 0000000..fcbcb7f --- /dev/null +++ b/hwy/tests/reverse_test.cc @@ -0,0 +1,176 @@ +// Copyright 2022 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
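+
+// Tests Reverse (whole vector), Reverse2/4/8 (within groups of 2/4/8 lanes)
+// and ReverseBlocks (reversing the order of 128-bit blocks).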
+ +#include + +#include "hwy/base.h" + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "tests/reverse_test.cc" +#include "hwy/foreach_target.h" // IWYU pragma: keep +#include "hwy/highway.h" +#include "hwy/tests/test_util-inl.h" + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { + +struct TestReverse { + template + HWY_NOINLINE void operator()(T /*unused*/, D d) { + const size_t N = Lanes(d); + const RebindToUnsigned du; // Iota does not support float16_t. + const auto v = BitCast(d, Iota(du, 1)); + auto expected = AllocateAligned(N); + + // Can't set float16_t value directly, need to permute in memory. + auto copy = AllocateAligned(N); + Store(v, d, copy.get()); + for (size_t i = 0; i < N; ++i) { + expected[i] = copy[N - 1 - i]; + } + HWY_ASSERT_VEC_EQ(d, expected.get(), Reverse(d, v)); + } +}; + +struct TestReverse2 { + template + HWY_NOINLINE void operator()(T /*unused*/, D d) { + const size_t N = Lanes(d); + const RebindToUnsigned du; // Iota does not support float16_t. + const auto v = BitCast(d, Iota(du, 1)); + auto expected = AllocateAligned(N); + + // Can't set float16_t value directly, need to permute in memory. + auto copy = AllocateAligned(N); + Store(v, d, copy.get()); + for (size_t i = 0; i < N; ++i) { + expected[i] = copy[i ^ 1]; + } + HWY_ASSERT_VEC_EQ(d, expected.get(), Reverse2(d, v)); + } +}; + +struct TestReverse4 { + template + HWY_NOINLINE void operator()(T /*unused*/, D d) { + const size_t N = Lanes(d); + const RebindToUnsigned du; // Iota does not support float16_t. + const auto v = BitCast(d, Iota(du, 1)); + auto expected = AllocateAligned(N); + + // Can't set float16_t value directly, need to permute in memory. + auto copy = AllocateAligned(N); + Store(v, d, copy.get()); + for (size_t i = 0; i < N; ++i) { + expected[i] = copy[i ^ 3]; + } + HWY_ASSERT_VEC_EQ(d, expected.get(), Reverse4(d, v)); + } +}; + +struct TestReverse8 { + template + HWY_NOINLINE void operator()(T /*unused*/, D d) { + const size_t N = Lanes(d); + const RebindToUnsigned du; // Iota does not support float16_t. + const auto v = BitCast(d, Iota(du, 1)); + auto expected = AllocateAligned(N); + + // Can't set float16_t value directly, need to permute in memory. + auto copy = AllocateAligned(N); + Store(v, d, copy.get()); + for (size_t i = 0; i < N; ++i) { + expected[i] = copy[i ^ 7]; + } + HWY_ASSERT_VEC_EQ(d, expected.get(), Reverse8(d, v)); + } +}; + +HWY_NOINLINE void TestAllReverse() { + // 8-bit is not supported because Risc-V uses rgather of Lanes - Iota, + // which requires 16 bits. + ForUIF163264(ForPartialVectors()); +} + +HWY_NOINLINE void TestAllReverse2() { + // 8-bit is not supported because Risc-V uses rgather of Lanes - Iota, + // which requires 16 bits. + ForUIF64(ForGEVectors<128, TestReverse2>()); + ForUIF32(ForGEVectors<64, TestReverse2>()); + ForUIF16(ForGEVectors<32, TestReverse2>()); +} + +HWY_NOINLINE void TestAllReverse4() { + // 8-bit is not supported because Risc-V uses rgather of Lanes - Iota, + // which requires 16 bits. + ForUIF64(ForGEVectors<256, TestReverse4>()); + ForUIF32(ForGEVectors<128, TestReverse4>()); + ForUIF16(ForGEVectors<64, TestReverse4>()); +} + +HWY_NOINLINE void TestAllReverse8() { + // 8-bit is not supported because Risc-V uses rgather of Lanes - Iota, + // which requires 16 bits. 
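+  // (Concretely, the gather indices range up to Lanes() - 1, which can exceed
+  // 255 for scalable vectors with 8-bit lanes; hence only 16-bit and wider
+  // lane types are exercised here.)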
+ ForUIF64(ForGEVectors<512, TestReverse8>()); + ForUIF32(ForGEVectors<256, TestReverse8>()); + ForUIF16(ForGEVectors<128, TestReverse8>()); +} + +struct TestReverseBlocks { + template + HWY_NOINLINE void operator()(T /*unused*/, D d) { + const size_t N = Lanes(d); + const RebindToUnsigned du; // Iota does not support float16_t. + const auto v = BitCast(d, Iota(du, 1)); + auto expected = AllocateAligned(N); + + constexpr size_t kLanesPerBlock = 16 / sizeof(T); + const size_t num_blocks = N / kLanesPerBlock; + HWY_ASSERT(num_blocks != 0); + + // Can't set float16_t value directly, need to permute in memory. + auto copy = AllocateAligned(N); + Store(v, d, copy.get()); + for (size_t i = 0; i < N; ++i) { + const size_t idx_block = i / kLanesPerBlock; + const size_t base = (num_blocks - 1 - idx_block) * kLanesPerBlock; + expected[i] = copy[base + (i % kLanesPerBlock)]; + } + HWY_ASSERT_VEC_EQ(d, expected.get(), ReverseBlocks(d, v)); + } +}; + +HWY_NOINLINE void TestAllReverseBlocks() { + ForAllTypes(ForGEVectors<128, TestReverseBlocks>()); +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE + +namespace hwy { +HWY_BEFORE_TEST(HwyReverseTest); +HWY_EXPORT_AND_TEST_P(HwyReverseTest, TestAllReverse); +HWY_EXPORT_AND_TEST_P(HwyReverseTest, TestAllReverse2); +HWY_EXPORT_AND_TEST_P(HwyReverseTest, TestAllReverse4); +HWY_EXPORT_AND_TEST_P(HwyReverseTest, TestAllReverse8); +HWY_EXPORT_AND_TEST_P(HwyReverseTest, TestAllReverseBlocks); +} // namespace hwy + +#endif diff --git a/hwy/tests/shift_test.cc b/hwy/tests/shift_test.cc new file mode 100644 index 0000000..585eba7 --- /dev/null +++ b/hwy/tests/shift_test.cc @@ -0,0 +1,428 @@ +// Copyright 2019 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include + +#include +#include + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "tests/shift_test.cc" +#include "hwy/foreach_target.h" // IWYU pragma: keep +#include "hwy/highway.h" +#include "hwy/tests/test_util-inl.h" + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { + +template +struct TestLeftShifts { + template + HWY_NOINLINE void operator()(T t, D d) { + if (kSigned) { + // Also test positive values + TestLeftShifts()(t, d); + } + + using TI = MakeSigned; + using TU = MakeUnsigned; + const size_t N = Lanes(d); + auto expected = AllocateAligned(N); + + // Values to shift + const auto values = Iota(d, static_cast(kSigned ? -TI(N) : TI(0))); + constexpr size_t kMaxShift = (sizeof(T) * 8) - 1; + + // 0 + HWY_ASSERT_VEC_EQ(d, values, ShiftLeft<0>(values)); + HWY_ASSERT_VEC_EQ(d, values, ShiftLeftSame(values, 0)); + + // 1 + for (size_t i = 0; i < N; ++i) { + const T value = kSigned ? 
T(T(i) - T(N)) : T(i); + expected[i] = T(TU(value) << 1); + } + HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftLeft<1>(values)); + HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftLeftSame(values, 1)); + + // max + for (size_t i = 0; i < N; ++i) { + const T value = kSigned ? T(T(i) - T(N)) : T(i); + expected[i] = T(TU(value) << kMaxShift); + } + HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftLeft(values)); + HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftLeftSame(values, kMaxShift)); + } +}; + +template +struct TestVariableLeftShifts { + template + HWY_NOINLINE void operator()(T t, D d) { + if (kSigned) { + // Also test positive values + TestVariableLeftShifts()(t, d); + } + + using TI = MakeSigned; + using TU = MakeUnsigned; + const size_t N = Lanes(d); + auto expected = AllocateAligned(N); + + const auto v0 = Zero(d); + const auto v1 = Set(d, 1); + const auto values = Iota(d, kSigned ? -TI(N) : TI(0)); // value to shift + + constexpr size_t kMaxShift = (sizeof(T) * 8) - 1; + const auto max_shift = Set(d, kMaxShift); + const auto small_shifts = And(Iota(d, 0), max_shift); + const auto large_shifts = max_shift - small_shifts; + + // Same: 0 + HWY_ASSERT_VEC_EQ(d, values, Shl(values, v0)); + + // Same: 1 + for (size_t i = 0; i < N; ++i) { + const T value = kSigned ? T(i) - T(N) : T(i); + expected[i] = T(TU(value) << 1); + } + HWY_ASSERT_VEC_EQ(d, expected.get(), Shl(values, v1)); + + // Same: max + for (size_t i = 0; i < N; ++i) { + const T value = kSigned ? T(i) - T(N) : T(i); + expected[i] = T(TU(value) << kMaxShift); + } + HWY_ASSERT_VEC_EQ(d, expected.get(), Shl(values, max_shift)); + + // Variable: small + for (size_t i = 0; i < N; ++i) { + const T value = kSigned ? T(i) - T(N) : T(i); + expected[i] = T(TU(value) << (i & kMaxShift)); + } + HWY_ASSERT_VEC_EQ(d, expected.get(), Shl(values, small_shifts)); + + // Variable: large + for (size_t i = 0; i < N; ++i) { + expected[i] = T(TU(1) << (kMaxShift - (i & kMaxShift))); + } + HWY_ASSERT_VEC_EQ(d, expected.get(), Shl(v1, large_shifts)); + } +}; + +struct TestUnsignedRightShifts { + template + HWY_NOINLINE void operator()(T /*unused*/, D d) { + const size_t N = Lanes(d); + auto expected = AllocateAligned(N); + + const auto values = Iota(d, 0); + + const T kMax = LimitsMax(); + constexpr size_t kMaxShift = (sizeof(T) * 8) - 1; + + // Shift by 0 + HWY_ASSERT_VEC_EQ(d, values, ShiftRight<0>(values)); + HWY_ASSERT_VEC_EQ(d, values, ShiftRightSame(values, 0)); + + // Shift by 1 + for (size_t i = 0; i < N; ++i) { + expected[i] = T(T(i & kMax) >> 1); + } + HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRight<1>(values)); + HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRightSame(values, 1)); + + // max + for (size_t i = 0; i < N; ++i) { + expected[i] = T(T(i & kMax) >> kMaxShift); + } + HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRight(values)); + HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRightSame(values, kMaxShift)); + } +}; + +struct TestRotateRight { + template + HWY_NOINLINE void operator()(T /*unused*/, D d) { + const size_t N = Lanes(d); + auto expected = AllocateAligned(N); + + constexpr size_t kBits = sizeof(T) * 8; + const auto mask_shift = Set(d, T{kBits}); + // Cover as many bit positions as possible to test shifting out + const auto values = Shl(Set(d, T{1}), And(Iota(d, 0), mask_shift)); + + // Rotate by 0 + HWY_ASSERT_VEC_EQ(d, values, RotateRight<0>(values)); + + // Rotate by 1 + Store(values, d, expected.get()); + for (size_t i = 0; i < N; ++i) { + expected[i] = (expected[i] >> 1) | (expected[i] << (kBits - 1)); + } + HWY_ASSERT_VEC_EQ(d, expected.get(), 
RotateRight<1>(values)); + + // Rotate by half + Store(values, d, expected.get()); + for (size_t i = 0; i < N; ++i) { + expected[i] = (expected[i] >> (kBits / 2)) | (expected[i] << (kBits / 2)); + } + HWY_ASSERT_VEC_EQ(d, expected.get(), RotateRight(values)); + + // Rotate by max + Store(values, d, expected.get()); + for (size_t i = 0; i < N; ++i) { + expected[i] = (expected[i] >> (kBits - 1)) | (expected[i] << 1); + } + HWY_ASSERT_VEC_EQ(d, expected.get(), RotateRight(values)); + } +}; + +struct TestVariableUnsignedRightShifts { + template + HWY_NOINLINE void operator()(T /*unused*/, D d) { + const size_t N = Lanes(d); + auto expected = AllocateAligned(N); + + const auto v0 = Zero(d); + const auto v1 = Set(d, 1); + const auto values = Iota(d, 0); + + const T kMax = LimitsMax(); + const auto max = Set(d, kMax); + + constexpr size_t kMaxShift = (sizeof(T) * 8) - 1; + const auto max_shift = Set(d, kMaxShift); + const auto small_shifts = And(Iota(d, 0), max_shift); + const auto large_shifts = max_shift - small_shifts; + + // Same: 0 + HWY_ASSERT_VEC_EQ(d, values, Shr(values, v0)); + + // Same: 1 + for (size_t i = 0; i < N; ++i) { + expected[i] = T(T(i & kMax) >> 1); + } + HWY_ASSERT_VEC_EQ(d, expected.get(), Shr(values, v1)); + + // Same: max + HWY_ASSERT_VEC_EQ(d, v0, Shr(values, max_shift)); + + // Variable: small + for (size_t i = 0; i < N; ++i) { + expected[i] = T(i) >> (i & kMaxShift); + } + HWY_ASSERT_VEC_EQ(d, expected.get(), Shr(values, small_shifts)); + + // Variable: Large + for (size_t i = 0; i < N; ++i) { + expected[i] = kMax >> (kMaxShift - (i & kMaxShift)); + } + HWY_ASSERT_VEC_EQ(d, expected.get(), Shr(max, large_shifts)); + } +}; + +template +T RightShiftNegative(T val) { + // C++ shifts are implementation-defined for negative numbers, and we have + // seen divisions replaced with shifts, so resort to bit operations. + using TU = hwy::MakeUnsigned; + TU bits; + CopySameSize(&val, &bits); + + const TU shifted = TU(bits >> kAmount); + + const TU all = TU(~TU(0)); + const size_t num_zero = sizeof(TU) * 8 - 1 - kAmount; + const TU sign_extended = static_cast((all << num_zero) & LimitsMax()); + + bits = shifted | sign_extended; + CopySameSize(&bits, &val); + return val; +} + +class TestSignedRightShifts { + public: + template + HWY_NOINLINE void operator()(T /*unused*/, D d) { + const size_t N = Lanes(d); + auto expected = AllocateAligned(N); + constexpr T kMin = LimitsMin(); + constexpr T kMax = LimitsMax(); + constexpr size_t kMaxShift = (sizeof(T) * 8) - 1; + + // First test positive values, negative are checked below. 
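+    // (The negative checks below compare against RightShiftNegative. For
+    // example, with T = int8_t and kAmount = 2: val = -128 has bits = 0x80,
+    // shifted = 0x20, sign_extended = 0xE0, so the result is 0xE0 = -32,
+    // matching the arithmetic shift -128 >> 2.)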
+ const auto v0 = Zero(d); + const auto values = And(Iota(d, 0), Set(d, kMax)); + + // Shift by 0 + HWY_ASSERT_VEC_EQ(d, values, ShiftRight<0>(values)); + HWY_ASSERT_VEC_EQ(d, values, ShiftRightSame(values, 0)); + + // Shift by 1 + for (size_t i = 0; i < N; ++i) { + expected[i] = T(T(i & kMax) >> 1); + } + HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRight<1>(values)); + HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRightSame(values, 1)); + + // max + HWY_ASSERT_VEC_EQ(d, v0, ShiftRight(values)); + HWY_ASSERT_VEC_EQ(d, v0, ShiftRightSame(values, kMaxShift)); + + // Even negative value + Test<0>(kMin, d, __LINE__); + Test<1>(kMin, d, __LINE__); + Test<2>(kMin, d, __LINE__); + Test(kMin, d, __LINE__); + + const T odd = static_cast(kMin + 1); + Test<0>(odd, d, __LINE__); + Test<1>(odd, d, __LINE__); + Test<2>(odd, d, __LINE__); + Test(odd, d, __LINE__); + } + + private: + template + void Test(T val, D d, int line) { + const auto expected = Set(d, RightShiftNegative(val)); + const auto in = Set(d, val); + const char* file = __FILE__; + AssertVecEqual(d, expected, ShiftRight(in), file, line); + AssertVecEqual(d, expected, ShiftRightSame(in, kAmount), file, line); + } +}; + +struct TestVariableSignedRightShifts { + template + HWY_NOINLINE void operator()(T /*unused*/, D d) { + using TU = MakeUnsigned; + const size_t N = Lanes(d); + auto expected = AllocateAligned(N); + + constexpr T kMin = LimitsMin(); + constexpr T kMax = LimitsMax(); + + constexpr size_t kMaxShift = (sizeof(T) * 8) - 1; + + // First test positive values, negative are checked below. + const auto v0 = Zero(d); + const auto positive = Iota(d, 0) & Set(d, kMax); + + // Shift by 0 + HWY_ASSERT_VEC_EQ(d, positive, ShiftRight<0>(positive)); + HWY_ASSERT_VEC_EQ(d, positive, ShiftRightSame(positive, 0)); + + // Shift by 1 + for (size_t i = 0; i < N; ++i) { + expected[i] = T(T(i & kMax) >> 1); + } + HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRight<1>(positive)); + HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRightSame(positive, 1)); + + // max + HWY_ASSERT_VEC_EQ(d, v0, ShiftRight(positive)); + HWY_ASSERT_VEC_EQ(d, v0, ShiftRightSame(positive, kMaxShift)); + + const auto max_shift = Set(d, kMaxShift); + const auto small_shifts = And(Iota(d, 0), max_shift); + const auto large_shifts = max_shift - small_shifts; + + const auto negative = Iota(d, kMin); + + // Test varying negative to shift + for (size_t i = 0; i < N; ++i) { + expected[i] = RightShiftNegative<1>(static_cast(kMin + i)); + } + HWY_ASSERT_VEC_EQ(d, expected.get(), Shr(negative, Set(d, 1))); + + // Shift MSB right by small amounts + for (size_t i = 0; i < N; ++i) { + const size_t amount = i & kMaxShift; + const TU shifted = ~((1ull << (kMaxShift - amount)) - 1); + CopySameSize(&shifted, &expected[i]); + } + HWY_ASSERT_VEC_EQ(d, expected.get(), Shr(Set(d, kMin), small_shifts)); + + // Shift MSB right by large amounts + for (size_t i = 0; i < N; ++i) { + const size_t amount = kMaxShift - (i & kMaxShift); + const TU shifted = ~((1ull << (kMaxShift - amount)) - 1); + CopySameSize(&shifted, &expected[i]); + } + HWY_ASSERT_VEC_EQ(d, expected.get(), Shr(Set(d, kMin), large_shifts)); + } +}; + +HWY_NOINLINE void TestAllShifts() { + ForUnsignedTypes(ForPartialVectors>()); + ForSignedTypes(ForPartialVectors>()); + ForUnsignedTypes(ForPartialVectors()); + ForSignedTypes(ForPartialVectors()); +} + +HWY_NOINLINE void TestAllVariableShifts() { + const ForPartialVectors> shl_u; + const ForPartialVectors> shl_s; + const ForPartialVectors shr_u; + const ForPartialVectors shr_s; + + shl_u(uint16_t()); + 
shr_u(uint16_t()); + + shl_u(uint32_t()); + shr_u(uint32_t()); + + shl_s(int16_t()); + shr_s(int16_t()); + + shl_s(int32_t()); + shr_s(int32_t()); + +#if HWY_HAVE_INTEGER64 + shl_u(uint64_t()); + shr_u(uint64_t()); + + shl_s(int64_t()); + shr_s(int64_t()); +#endif +} + +HWY_NOINLINE void TestAllRotateRight() { + const ForPartialVectors test; + test(uint32_t()); +#if HWY_HAVE_INTEGER64 + test(uint64_t()); +#endif +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE + +namespace hwy { +HWY_BEFORE_TEST(HwyShiftTest); +HWY_EXPORT_AND_TEST_P(HwyShiftTest, TestAllShifts); +HWY_EXPORT_AND_TEST_P(HwyShiftTest, TestAllVariableShifts); +HWY_EXPORT_AND_TEST_P(HwyShiftTest, TestAllRotateRight); +} // namespace hwy + +#endif diff --git a/hwy/tests/swizzle_test.cc b/hwy/tests/swizzle_test.cc new file mode 100644 index 0000000..f447f7a --- /dev/null +++ b/hwy/tests/swizzle_test.cc @@ -0,0 +1,272 @@ +// Copyright 2019 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include // memset + +#include "hwy/base.h" + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "tests/swizzle_test.cc" +#include "hwy/foreach_target.h" // IWYU pragma: keep +#include "hwy/highway.h" +#include "hwy/tests/test_util-inl.h" + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { + +struct TestGetLane { + template + HWY_NOINLINE void operator()(T /*unused*/, D d) { + const auto v = Iota(d, T(1)); + HWY_ASSERT_EQ(T(1), GetLane(v)); + } +}; + +HWY_NOINLINE void TestAllGetLane() { + ForAllTypes(ForPartialVectors()); +} + +struct TestExtractLane { + template + HWY_NOINLINE void operator()(T /*unused*/, D d) { + const auto v = Iota(d, T(1)); + for (size_t i = 0; i < Lanes(d); ++i) { + const T actual = ExtractLane(v, i); + HWY_ASSERT_EQ(static_cast(i + 1), actual); + } + } +}; + +HWY_NOINLINE void TestAllExtractLane() { + ForAllTypes(ForPartialVectors()); +} + +struct TestInsertLane { + template + HWY_NOINLINE void operator()(T /*unused*/, D d) { + using V = Vec; + const V v = Iota(d, T(1)); + const size_t N = Lanes(d); + auto lanes = AllocateAligned(N); + Store(v, d, lanes.get()); + + for (size_t i = 0; i < Lanes(d); ++i) { + lanes[i] = T{0}; + const V actual = InsertLane(v, i, static_cast(i + 1)); + HWY_ASSERT_VEC_EQ(d, v, actual); + Store(v, d, lanes.get()); // restore lane i + } + } +}; + +HWY_NOINLINE void TestAllInsertLane() { + ForAllTypes(ForPartialVectors()); +} + +struct TestDupEven { + template + HWY_NOINLINE void operator()(T /*unused*/, D d) { + const size_t N = Lanes(d); + auto expected = AllocateAligned(N); + for (size_t i = 0; i < N; ++i) { + expected[i] = static_cast((static_cast(i) & ~1) + 1); + } + HWY_ASSERT_VEC_EQ(d, expected.get(), DupEven(Iota(d, 1))); + } +}; + +HWY_NOINLINE void TestAllDupEven() { + ForUIF3264(ForShrinkableVectors()); +} + +struct TestDupOdd { + template + HWY_NOINLINE void operator()(T /*unused*/, D d) { 
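+    // DupOdd copies each odd lane into the even lane below it, which requires
+    // at least two lanes; the single-lane scalar target is skipped below.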
+#if HWY_TARGET != HWY_SCALAR + const size_t N = Lanes(d); + auto expected = AllocateAligned(N); + for (size_t i = 0; i < N; ++i) { + expected[i] = static_cast((static_cast(i) & ~1) + 2); + } + HWY_ASSERT_VEC_EQ(d, expected.get(), DupOdd(Iota(d, 1))); +#else + (void)d; +#endif + } +}; + +HWY_NOINLINE void TestAllDupOdd() { + ForUIF3264(ForShrinkableVectors()); +} + +struct TestOddEven { + template + HWY_NOINLINE void operator()(T /*unused*/, D d) { + const size_t N = Lanes(d); + const auto even = Iota(d, 1); + const auto odd = Iota(d, static_cast(1 + N)); + auto expected = AllocateAligned(N); + for (size_t i = 0; i < N; ++i) { + expected[i] = static_cast(1 + i + ((i & 1) ? N : 0)); + } + HWY_ASSERT_VEC_EQ(d, expected.get(), OddEven(odd, even)); + } +}; + +HWY_NOINLINE void TestAllOddEven() { + ForAllTypes(ForShrinkableVectors()); +} + +struct TestOddEvenBlocks { + template + HWY_NOINLINE void operator()(T /*unused*/, D d) { + const size_t N = Lanes(d); + const auto even = Iota(d, 1); + const auto odd = Iota(d, static_cast(1 + N)); + auto expected = AllocateAligned(N); + for (size_t i = 0; i < N; ++i) { + const size_t idx_block = i / (16 / sizeof(T)); + expected[i] = static_cast(1 + i + ((idx_block & 1) ? N : 0)); + } + HWY_ASSERT_VEC_EQ(d, expected.get(), OddEvenBlocks(odd, even)); + } +}; + +HWY_NOINLINE void TestAllOddEvenBlocks() { + ForAllTypes(ForGEVectors<128, TestOddEvenBlocks>()); +} + +struct TestSwapAdjacentBlocks { + template + HWY_NOINLINE void operator()(T /*unused*/, D d) { + const size_t N = Lanes(d); + constexpr size_t kLanesPerBlock = 16 / sizeof(T); + if (N < 2 * kLanesPerBlock) return; + const auto vi = Iota(d, 1); + auto expected = AllocateAligned(N); + for (size_t i = 0; i < N; ++i) { + const size_t idx_block = i / kLanesPerBlock; + const size_t base = (idx_block ^ 1) * kLanesPerBlock; + const size_t mod = i % kLanesPerBlock; + expected[i] = static_cast(1 + base + mod); + } + HWY_ASSERT_VEC_EQ(d, expected.get(), SwapAdjacentBlocks(vi)); + } +}; + +HWY_NOINLINE void TestAllSwapAdjacentBlocks() { + ForAllTypes(ForGEVectors<128, TestSwapAdjacentBlocks>()); +} + +struct TestTableLookupLanes { + template + HWY_NOINLINE void operator()(T /*unused*/, D d) { + const RebindToSigned di; + using TI = TFromD; +#if HWY_TARGET != HWY_SCALAR + const size_t N = Lanes(d); + auto idx = AllocateAligned(N); + memset(idx.get(), 0, N * sizeof(TI)); + auto expected = AllocateAligned(N); + const auto v = Iota(d, 1); + + if (N <= 8) { // Test all permutations + for (size_t i0 = 0; i0 < N; ++i0) { + idx[0] = static_cast(i0); + + for (size_t i1 = 0; i1 < N; ++i1) { + if (N >= 2) idx[1] = static_cast(i1); + for (size_t i2 = 0; i2 < N; ++i2) { + if (N >= 4) idx[2] = static_cast(i2); + for (size_t i3 = 0; i3 < N; ++i3) { + if (N >= 4) idx[3] = static_cast(i3); + + for (size_t i = 0; i < N; ++i) { + expected[i] = static_cast(idx[i] + 1); // == v[idx[i]] + } + + const auto opaque1 = IndicesFromVec(d, Load(di, idx.get())); + const auto actual1 = TableLookupLanes(v, opaque1); + HWY_ASSERT_VEC_EQ(d, expected.get(), actual1); + + const auto opaque2 = SetTableIndices(d, idx.get()); + const auto actual2 = TableLookupLanes(v, opaque2); + HWY_ASSERT_VEC_EQ(d, expected.get(), actual2); + } + } + } + } + } else { + // Too many permutations to test exhaustively; choose one with repeated + // and cross-block indices and ensure indices do not exceed #lanes. + // For larger vectors, upper lanes will be zero. 
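+      // (Each idx entry selects a source lane of v = Iota(d, 1), so the
+      // expected lane value is idx[i] + 1 after the clamping below.)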
+ HWY_ALIGN TI idx_source[16] = {1, 3, 2, 2, 8, 1, 7, 6, + 15, 14, 14, 15, 4, 9, 8, 5}; + for (size_t i = 0; i < N; ++i) { + idx[i] = (i < 16) ? idx_source[i] : 0; + // Avoid undefined results / asan error for scalar by capping indices. + if (idx[i] >= static_cast(N)) { + idx[i] = static_cast(N - 1); + } + expected[i] = static_cast(idx[i] + 1); // == v[idx[i]] + } + + const auto opaque1 = IndicesFromVec(d, Load(di, idx.get())); + const auto actual1 = TableLookupLanes(v, opaque1); + HWY_ASSERT_VEC_EQ(d, expected.get(), actual1); + + const auto opaque2 = SetTableIndices(d, idx.get()); + const auto actual2 = TableLookupLanes(v, opaque2); + HWY_ASSERT_VEC_EQ(d, expected.get(), actual2); + } +#else + const TI index = 0; + const auto v = Set(d, 1); + const auto opaque1 = SetTableIndices(d, &index); + HWY_ASSERT_VEC_EQ(d, v, TableLookupLanes(v, opaque1)); + const auto opaque2 = IndicesFromVec(d, Zero(di)); + HWY_ASSERT_VEC_EQ(d, v, TableLookupLanes(v, opaque2)); +#endif + } +}; + +HWY_NOINLINE void TestAllTableLookupLanes() { + ForUIF3264(ForPartialVectors()); +} + + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE + +namespace hwy { +HWY_BEFORE_TEST(HwySwizzleTest); +HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllGetLane); +HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllExtractLane); +HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllInsertLane); +HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllDupEven); +HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllDupOdd); +HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllOddEven); +HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllOddEvenBlocks); +HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllSwapAdjacentBlocks); +HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllTableLookupLanes); +} // namespace hwy + +#endif diff --git a/hwy/tests/test_util-inl.h b/hwy/tests/test_util-inl.h new file mode 100644 index 0000000..d9c1aeb --- /dev/null +++ b/hwy/tests/test_util-inl.h @@ -0,0 +1,665 @@ +// Copyright 2019 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Target-specific helper functions for use by *_test.cc. + +#include + +#include "hwy/base.h" +#include "hwy/tests/hwy_gtest.h" +#include "hwy/tests/test_util.h" + +// After test_util (also includes highway.h) +#include "hwy/print-inl.h" + +// Per-target include guard +#if defined(HIGHWAY_HWY_TESTS_TEST_UTIL_INL_H_) == \ + defined(HWY_TARGET_TOGGLE) +#ifdef HIGHWAY_HWY_TESTS_TEST_UTIL_INL_H_ +#undef HIGHWAY_HWY_TESTS_TEST_UTIL_INL_H_ +#else +#define HIGHWAY_HWY_TESTS_TEST_UTIL_INL_H_ +#endif + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { + +// Compare expected vector to vector. +// HWY_INLINE works around a Clang SVE compiler bug where all but the first +// 128 bits (the NEON register) of actual are zero. 
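For orientation, here is how a typical *_test.cc consumes these helpers: a functor with a templated operator()(T, D) computes expected lanes in memory, asserts with HWY_ASSERT_VEC_EQ, and is instantiated via the For* adapters and type lists defined later in this header. A minimal sketch (TestPlusOne is a hypothetical example, assuming the usual HWY_NAMESPACE test-file scaffolding):

struct TestPlusOne {
  template <typename T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    const size_t N = Lanes(d);
    auto expected = AllocateAligned<T>(N);
    for (size_t i = 0; i < N; ++i) {
      expected[i] = static_cast<T>(i + 2);  // Iota(d, 1) is {1, 2, ...}
    }
    HWY_ASSERT_VEC_EQ(d, expected.get(), Add(Iota(d, 1), Set(d, T{1})));
  }
};

HWY_NOINLINE void TestAllPlusOne() {
  // Adapters are invoked as C()(T()), typically via a type list:
  ForAllTypes(ForPartialVectors<TestPlusOne>());
}

The helpers themselves follow, beginning with the AssertVecEqual described in the comment above.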
+template , class V = Vec> +HWY_INLINE void AssertVecEqual(D d, const T* expected, VecArg actual, + const char* filename, const int line) { + const size_t N = Lanes(d); + auto actual_lanes = AllocateAligned(N); + Store(actual, d, actual_lanes.get()); + + const auto info = hwy::detail::MakeTypeInfo(); + const char* target_name = hwy::TargetName(HWY_TARGET); + hwy::detail::AssertArrayEqual(info, expected, actual_lanes.get(), N, + target_name, filename, line); +} + +// Compare expected lanes to vector. +// HWY_INLINE works around a Clang SVE compiler bug where all but the first +// 128 bits (the NEON register) of actual are zero. +template , class V = Vec> +HWY_INLINE void AssertVecEqual(D d, VecArg expected, VecArg actual, + const char* filename, int line) { + auto expected_lanes = AllocateAligned(Lanes(d)); + Store(expected, d, expected_lanes.get()); + AssertVecEqual(d, expected_lanes.get(), actual, filename, line); +} + +// Only checks the valid mask elements (those whose index < Lanes(d)). +template +HWY_NOINLINE void AssertMaskEqual(D d, VecArg> a, VecArg> b, + const char* filename, int line) { + // lvalues prevented MSAN failure in farm_sve. + const Vec va = VecFromMask(d, a); + const Vec vb = VecFromMask(d, b); + AssertVecEqual(d, va, vb, filename, line); + + const char* target_name = hwy::TargetName(HWY_TARGET); + AssertEqual(CountTrue(d, a), CountTrue(d, b), target_name, filename, line); + AssertEqual(AllTrue(d, a), AllTrue(d, b), target_name, filename, line); + AssertEqual(AllFalse(d, a), AllFalse(d, b), target_name, filename, line); + + const size_t N = Lanes(d); +#if HWY_TARGET == HWY_SCALAR + const Rebind d8; +#else + const Repartition d8; +#endif + const size_t N8 = Lanes(d8); + auto bits_a = AllocateAligned(HWY_MAX(8, N8)); + auto bits_b = AllocateAligned(HWY_MAX(8, N8)); + memset(bits_a.get(), 0, N8); + memset(bits_b.get(), 0, N8); + const size_t num_bytes_a = StoreMaskBits(d, a, bits_a.get()); + const size_t num_bytes_b = StoreMaskBits(d, b, bits_b.get()); + AssertEqual(num_bytes_a, num_bytes_b, target_name, filename, line); + size_t i = 0; + // First check whole bytes (if that many elements are still valid) + for (; i < N / 8; ++i) { + if (bits_a[i] != bits_b[i]) { + fprintf(stderr, "Mismatch in byte %d: %d != %d\n", static_cast(i), + bits_a[i], bits_b[i]); + Print(d8, "expect", Load(d8, bits_a.get()), 0, N8); + Print(d8, "actual", Load(d8, bits_b.get()), 0, N8); + hwy::Abort(filename, line, "Masks not equal"); + } + } + // Then the valid bit(s) in the last byte. + const size_t remainder = N % 8; + if (remainder != 0) { + const int mask = (1 << remainder) - 1; + const int valid_a = bits_a[i] & mask; + const int valid_b = bits_b[i] & mask; + if (valid_a != valid_b) { + fprintf(stderr, "Mismatch in last byte %d: %d != %d\n", + static_cast(i), valid_a, valid_b); + Print(d8, "expect", Load(d8, bits_a.get()), 0, N8); + Print(d8, "actual", Load(d8, bits_b.get()), 0, N8); + hwy::Abort(filename, line, "Masks not equal"); + } + } +} + +// Only sets valid elements (those whose index < Lanes(d)). This helps catch +// tests that are not masking off the (undefined) upper mask elements. +// +// TODO(janwas): with HWY_NOINLINE GCC zeros the upper half of AVX2 masks. 
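+// (MaskTrue returns FirstN(d, Lanes(d)), i.e. exactly the valid lanes are
+// true; MaskFalse below compares Zero < Zero, which no lane satisfies.)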
+template +HWY_INLINE Mask MaskTrue(const D d) { + return FirstN(d, Lanes(d)); +} + +template +HWY_INLINE Mask MaskFalse(const D d) { + const auto zero = Zero(RebindToSigned()); + return RebindMask(d, Lt(zero, zero)); +} + +#ifndef HWY_ASSERT_EQ + +#define HWY_ASSERT_EQ(expected, actual) \ + hwy::AssertEqual(expected, actual, hwy::TargetName(HWY_TARGET), __FILE__, \ + __LINE__) + +#define HWY_ASSERT_ARRAY_EQ(expected, actual, count) \ + hwy::AssertArrayEqual(expected, actual, count, hwy::TargetName(HWY_TARGET), \ + __FILE__, __LINE__) + +#define HWY_ASSERT_STRING_EQ(expected, actual) \ + hwy::AssertStringEqual(expected, actual, hwy::TargetName(HWY_TARGET), \ + __FILE__, __LINE__) + +#define HWY_ASSERT_VEC_EQ(d, expected, actual) \ + AssertVecEqual(d, expected, actual, __FILE__, __LINE__) + +#define HWY_ASSERT_MASK_EQ(d, expected, actual) \ + AssertMaskEqual(d, expected, actual, __FILE__, __LINE__) + +#endif // HWY_ASSERT_EQ + +namespace detail { + +// Helpers for instantiating tests with combinations of lane types / counts. + +// Calls Test for each CappedTag where N is in [kMinLanes, kMul * kMinArg] +// and the resulting Lanes() is in [min_lanes, max_lanes]. The upper bound +// is required to ensure capped vectors remain extendable. Implemented by +// recursively halving kMul until it is zero. +template +struct ForeachCappedR { + static void Do(size_t min_lanes, size_t max_lanes) { + const CappedTag d; + + // If we already don't have enough lanes, stop. + const size_t lanes = Lanes(d); + if (lanes < min_lanes) return; + + if (lanes <= max_lanes) { + Test()(T(), d); + } + ForeachCappedR::Do(min_lanes, max_lanes); + } +}; + +// Base case to stop the recursion. +template +struct ForeachCappedR { + static void Do(size_t, size_t) {} +}; + +#if HWY_HAVE_SCALABLE + +template +constexpr int MinPow2() { + // Highway follows RVV LMUL in that the smallest fraction is 1/8th (encoded + // as kPow2 == -3). The fraction also must not result in zero lanes for the + // smallest possible vector size, which is 128 bits even on RISC-V (with the + // application processor profile). + return HWY_MAX(-3, -static_cast(CeilLog2(16 / sizeof(T)))); +} + +// Iterates kPow2 upward through +3. +template +struct ForeachShiftR { + static void Do(size_t min_lanes) { + const ScalableTag d; + + // Precondition: [kPow2, 3] + kAddPow2 is a valid fraction of the minimum + // vector size, so we always have enough lanes, except ForGEVectors. + if (Lanes(d) >= min_lanes) { + Test()(T(), d); + } else { + fprintf(stderr, "%d lanes < %d: T=%d pow=%d\n", + static_cast(Lanes(d)), static_cast(min_lanes), + static_cast(sizeof(T)), kPow2 + kAddPow2); + HWY_ASSERT(min_lanes != 1); + } + + ForeachShiftR::Do(min_lanes); + } +}; + +// Base case to stop the recursion. +template +struct ForeachShiftR { + static void Do(size_t) {} +}; +#else +// ForeachCappedR already handled all possible sizes. +#endif // HWY_HAVE_SCALABLE + +} // namespace detail + +// These 'adapters' call a test for all possible N or kPow2 subject to +// constraints such as "vectors must be extendable" or "vectors >= 128 bits". +// They may be called directly, or via For*Types. Note that for an adapter C, +// `C(T())` does not call the test - the correct invocation is +// `C()(T())`, or preferably `ForAllTypes(C())`. We check at runtime +// that operator() is called to prevent such bugs. Note that this is not +// thread-safe, but that is fine because C are typically local variables. + +// Calls Test for all power of two N in [1, Lanes(d) >> kPow2]. 
This is for +// ops that widen their input, e.g. Combine (not supported by HWY_SCALAR). +template +class ForExtendableVectors { + mutable bool called_ = false; + + public: + ~ForExtendableVectors() { + if (!called_) { + HWY_ABORT("Test is incorrect, ensure operator() is called"); + } + } + + template + void operator()(T /*unused*/) const { + called_ = true; + constexpr size_t kMaxCapped = HWY_LANES(T); + // Skip CappedTag that are already full vectors. + const size_t max_lanes = Lanes(ScalableTag()) >> kPow2; + (void)kMaxCapped; + (void)max_lanes; +#if HWY_TARGET == HWY_SCALAR + // not supported +#else + detail::ForeachCappedR> kPow2), 1, Test>::Do(1, max_lanes); +#if HWY_TARGET == HWY_RVV + // For each [MinPow2, 3 - kPow2]; counter is [MinPow2 + kPow2, 3]. + detail::ForeachShiftR() + kPow2, -kPow2, Test>::Do(1); +#elif HWY_HAVE_SCALABLE + // For each [MinPow2, 0 - kPow2]; counter is [MinPow2 + kPow2 + 3, 3]. + detail::ForeachShiftR() + kPow2 + 3, -kPow2 - 3, + Test>::Do(1); +#endif +#endif // HWY_SCALAR + } +}; + +// Calls Test for all power of two N in [1 << kPow2, Lanes(d)]. This is for ops +// that narrow their input, e.g. UpperHalf. +template +class ForShrinkableVectors { + mutable bool called_ = false; + + public: + ~ForShrinkableVectors() { + if (!called_) { + HWY_ABORT("Test is incorrect, ensure operator() is called"); + } + } + + template + void operator()(T /*unused*/) const { + called_ = true; + constexpr size_t kMinLanes = size_t{1} << kPow2; + constexpr size_t kMaxCapped = HWY_LANES(T); + // For shrinking, an upper limit is unnecessary. + constexpr size_t max_lanes = kMaxCapped; + + (void)kMinLanes; + (void)max_lanes; + (void)max_lanes; +#if HWY_TARGET == HWY_SCALAR + // not supported +#else + detail::ForeachCappedR> kPow2), kMinLanes, Test>::Do( + kMinLanes, max_lanes); +#if HWY_TARGET == HWY_RVV + // For each [MinPow2 + kPow2, 3]; counter is [MinPow2 + kPow2, 3]. + detail::ForeachShiftR() + kPow2, 0, Test>::Do( + kMinLanes); +#elif HWY_HAVE_SCALABLE + // For each [MinPow2 + kPow2, 0]; counter is [MinPow2 + kPow2 + 3, 3]. + detail::ForeachShiftR() + kPow2 + 3, -3, Test>::Do( + kMinLanes); +#endif +#endif // HWY_TARGET == HWY_SCALAR + } +}; + +// Calls Test for all supported power of two vectors of at least kMinBits. +// Examples: AES or 64x64 require 128 bits, casts may require 64 bits. +template +class ForGEVectors { + mutable bool called_ = false; + + public: + ~ForGEVectors() { + if (!called_) { + HWY_ABORT("Test is incorrect, ensure operator() is called"); + } + } + + template + void operator()(T /*unused*/) const { + called_ = true; + constexpr size_t kMaxCapped = HWY_LANES(T); + constexpr size_t kMinLanes = kMinBits / 8 / sizeof(T); + // An upper limit is unnecessary. + constexpr size_t max_lanes = kMaxCapped; + (void)max_lanes; +#if HWY_TARGET == HWY_SCALAR + (void)kMinLanes; // not supported +#else + detail::ForeachCappedR::Do( + kMinLanes, max_lanes); +#if HWY_TARGET == HWY_RVV + // Can be 0 (handled below) if kMinBits > 64. + constexpr size_t kRatio = 128 / kMinBits; + constexpr int kMinPow2 = + kRatio == 0 ? 0 : -static_cast(CeilLog2(kRatio)); + // For each [kMinPow2, 3]; counter is [kMinPow2, 3]. + detail::ForeachShiftR::Do(kMinLanes); +#elif HWY_HAVE_SCALABLE + // Can be 0 (handled below) if kMinBits > 128. + constexpr size_t kRatio = 128 / kMinBits; + constexpr int kMinPow2 = + kRatio == 0 ? 0 : -static_cast(CeilLog2(kRatio)); + // For each [kMinPow2, 0]; counter is [kMinPow2 + 3, 3]. 
+    detail::ForeachShiftR<T, kMinPow2 + 3, 3, Test>::Do(kMinLanes);
+#endif
+#endif  // HWY_TARGET == HWY_SCALAR
+  }
+};
+
+template <class Test>
+using ForGE128Vectors = ForGEVectors<128, Test>;
+
+// Calls Test for all N that can be promoted (not the same as Extendable because
+// HWY_SCALAR has one lane). Also used for ZipLower, but not ZipUpper.
+template <class Test, int kPow2 = 1>
+class ForPromoteVectors {
+  mutable bool called_ = false;
+
+ public:
+  ~ForPromoteVectors() {
+    if (!called_) {
+      HWY_ABORT("Test is incorrect, ensure operator() is called");
+    }
+  }
+
+  template <typename T>
+  void operator()(T /*unused*/) const {
+    called_ = true;
+    constexpr size_t kFactor = size_t{1} << kPow2;
+    static_assert(kFactor >= 2 && kFactor * sizeof(T) <= sizeof(uint64_t), "");
+    constexpr size_t kMaxCapped = HWY_LANES(T);
+    constexpr size_t kMinLanes = kFactor;
+    // Skip CappedTag that are already full vectors.
+    const size_t max_lanes = Lanes(ScalableTag<T>()) >> kPow2;
+    (void)kMaxCapped;
+    (void)kMinLanes;
+    (void)max_lanes;
+#if HWY_TARGET == HWY_SCALAR
+    detail::ForeachCappedR<T, 1, 1, Test>::Do(1, 1);
+#else
+    // TODO(janwas): call Extendable if kMinLanes check not required?
+    detail::ForeachCappedR<T, (kMaxCapped >> kPow2), 1, Test>::Do(kMinLanes,
+                                                                  max_lanes);
+#if HWY_TARGET == HWY_RVV
+    // For each [MinPow2, 3 - kPow2]; counter is [MinPow2 + kPow2, 3].
+    detail::ForeachShiftR<T, detail::MinPow2<T>() + kPow2, -kPow2, Test>::Do(
+        kMinLanes);
+#elif HWY_HAVE_SCALABLE
+    // For each [MinPow2, 0 - kPow2]; counter is [MinPow2 + kPow2 + 3, 3].
+    detail::ForeachShiftR<T, detail::MinPow2<T>() + kPow2 + 3, -kPow2 - 3,
+                          Test>::Do(kMinLanes);
+#endif
+#endif  // HWY_SCALAR
+  }
+};
+
+// Calls Test for all N that can be demoted (not the same as Shrinkable because
+// HWY_SCALAR has one lane).
+template <class Test, int kPow2 = 1>
+class ForDemoteVectors {
+  mutable bool called_ = false;
+
+ public:
+  ~ForDemoteVectors() {
+    if (!called_) {
+      HWY_ABORT("Test is incorrect, ensure operator() is called");
+    }
+  }
+
+  template <typename T>
+  void operator()(T /*unused*/) const {
+    called_ = true;
+    constexpr size_t kMinLanes = size_t{1} << kPow2;
+    constexpr size_t kMaxCapped = HWY_LANES(T);
+    // For shrinking, an upper limit is unnecessary.
+    constexpr size_t max_lanes = kMaxCapped;
+
+    (void)kMinLanes;
+    (void)max_lanes;
+    (void)max_lanes;
+#if HWY_TARGET == HWY_SCALAR
+    detail::ForeachCappedR<T, 1, 1, Test>::Do(1, 1);
+#else
+    detail::ForeachCappedR<T, (kMaxCapped >> kPow2), kMinLanes, Test>::Do(
+        kMinLanes, max_lanes);
+
+// TODO(janwas): call Extendable if kMinLanes check not required?
+#if HWY_TARGET == HWY_RVV
+    // For each [MinPow2 + kPow2, 3]; counter is [MinPow2 + kPow2, 3].
+    detail::ForeachShiftR<T, detail::MinPow2<T>() + kPow2, 0, Test>::Do(
+        kMinLanes);
+#elif HWY_HAVE_SCALABLE
+    // For each [MinPow2 + kPow2, 0]; counter is [MinPow2 + kPow2 + 3, 3].
+    detail::ForeachShiftR<T, detail::MinPow2<T>() + kPow2 + 3, -3, Test>::Do(
+        kMinLanes);
+#endif
+#endif  // HWY_TARGET == HWY_SCALAR
+  }
+};
+
+// For LowerHalf/Quarter.
+template <class Test, int kPow2 = 1>
+class ForHalfVectors {
+  mutable bool called_ = false;
+
+ public:
+  ~ForHalfVectors() {
+    if (!called_) {
+      HWY_ABORT("Test is incorrect, ensure operator() is called");
+    }
+  }
+
+  template <typename T>
+  void operator()(T /*unused*/) const {
+    called_ = true;
+#if HWY_TARGET == HWY_SCALAR
+    detail::ForeachCappedR<T, 1, 1, Test>::Do(1, 1);
+#else
+    constexpr size_t kMinLanes = size_t{1} << kPow2;
+    // For shrinking, an upper limit is unnecessary.
+    constexpr size_t kMaxCapped = HWY_LANES(T);
+    detail::ForeachCappedR<T, (kMaxCapped >> kPow2), kMinLanes, Test>::Do(
+        kMinLanes, kMaxCapped);
+
+// TODO(janwas): call Extendable if kMinLanes check not required?
+#if HWY_TARGET == HWY_RVV
+    // For each [MinPow2 + kPow2, 3]; counter is [MinPow2 + kPow2, 3].
+    detail::ForeachShiftR<T, detail::MinPow2<T>() + kPow2, 0, Test>::Do(
+        kMinLanes);
+#elif HWY_HAVE_SCALABLE
+    // For each [MinPow2 + kPow2, 0]; counter is [MinPow2 + kPow2 + 3, 3].
+    detail::ForeachShiftR<T, detail::MinPow2<T>() + kPow2 + 3, -3, Test>::Do(
+        kMinLanes);
+#endif
+#endif  // HWY_TARGET == HWY_SCALAR
+  }
+};
+
+// Calls Test for all power of two N in [1, Lanes(d)]. This is the default
+// for ops that do not narrow nor widen their input, nor require 128 bits.
+template <class Test>
+class ForPartialVectors {
+  mutable bool called_ = false;
+
+ public:
+  ~ForPartialVectors() {
+    if (!called_) {
+      HWY_ABORT("Test is incorrect, ensure operator() is called");
+    }
+  }
+
+  template <typename T>
+  void operator()(T t) const {
+    called_ = true;
+#if HWY_TARGET == HWY_SCALAR
+    (void)t;
+    detail::ForeachCappedR<T, 1, 1, Test>::Do(1, 1);
+#else
+    ForExtendableVectors<Test, 0>()(t);
+#endif
+  }
+};
+
+// Type lists to shorten call sites:
+
+template <class Func>
+void ForSignedTypes(const Func& func) {
+  func(int8_t());
+  func(int16_t());
+  func(int32_t());
+#if HWY_HAVE_INTEGER64
+  func(int64_t());
+#endif
+}
+
+template <class Func>
+void ForUnsignedTypes(const Func& func) {
+  func(uint8_t());
+  func(uint16_t());
+  func(uint32_t());
+#if HWY_HAVE_INTEGER64
+  func(uint64_t());
+#endif
+}
+
+template <class Func>
+void ForIntegerTypes(const Func& func) {
+  ForSignedTypes(func);
+  ForUnsignedTypes(func);
+}
+
+template <class Func>
+void ForFloatTypes(const Func& func) {
+  func(float());
+#if HWY_HAVE_FLOAT64
+  func(double());
+#endif
+}
+
+template <class Func>
+void ForAllTypes(const Func& func) {
+  ForIntegerTypes(func);
+  ForFloatTypes(func);
+}
+
+template <class Func>
+void ForUI8(const Func& func) {
+  func(uint8_t());
+  func(int8_t());
+}
+
+template <class Func>
+void ForUI16(const Func& func) {
+  func(uint16_t());
+  func(int16_t());
+}
+
+template <class Func>
+void ForUIF16(const Func& func) {
+  ForUI16(func);
+#if HWY_HAVE_FLOAT16
+  func(float16_t());
+#endif
+}
+
+template <class Func>
+void ForUI32(const Func& func) {
+  func(uint32_t());
+  func(int32_t());
+}
+
+template <class Func>
+void ForUIF32(const Func& func) {
+  ForUI32(func);
+  func(float());
+}
+
+template <class Func>
+void ForUI64(const Func& func) {
+#if HWY_HAVE_INTEGER64
+  func(uint64_t());
+  func(int64_t());
+#endif
+}
+
+template <class Func>
+void ForUIF64(const Func& func) {
+  ForUI64(func);
+#if HWY_HAVE_FLOAT64
+  func(double());
+#endif
+}
+
+template <class Func>
+void ForUI3264(const Func& func) {
+  ForUI32(func);
+  ForUI64(func);
+}
+
+template <class Func>
+void ForUIF3264(const Func& func) {
+  ForUIF32(func);
+  ForUIF64(func);
+}
+
+template <class Func>
+void ForUI163264(const Func& func) {
+  ForUI16(func);
+  ForUI3264(func);
+}
+
+template <class Func>
+void ForUIF163264(const Func& func) {
+  ForUIF16(func);
+  ForUIF3264(func);
+}
+
+// For tests that involve loops, adjust the trip count so that emulated tests
+// finish quickly (but always at least 2 iterations to ensure some diversity).
+constexpr size_t AdjustedReps(size_t max_reps) {
+#if HWY_ARCH_RVV
+  return HWY_MAX(max_reps / 32, 2);
+#elif HWY_IS_DEBUG_BUILD
+  return HWY_MAX(max_reps / 8, 2);
+#elif HWY_ARCH_ARM
+  return HWY_MAX(max_reps / 4, 2);
+#else
+  return HWY_MAX(max_reps, 2);
+#endif
+}
+
+// Same as above, but the loop trip count will be 1 << max_pow2.
+constexpr size_t AdjustedLog2Reps(size_t max_pow2) {
+  // If "negative" (unsigned wraparound), use original.
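+  // (max_pow2 is unsigned, so max_pow2 - 4 wraps to a huge value whenever
+  // max_pow2 < 4; HWY_MIN then falls back to the original max_pow2.)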
+#if HWY_ARCH_RVV + return HWY_MIN(max_pow2 - 4, max_pow2); +#elif HWY_IS_DEBUG_BUILD + return HWY_MIN(max_pow2 - 1, max_pow2); +#elif HWY_ARCH_ARM + return HWY_MIN(max_pow2 - 1, max_pow2); +#else + return max_pow2; +#endif +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +#endif // per-target include guard diff --git a/hwy/tests/test_util.cc b/hwy/tests/test_util.cc new file mode 100644 index 0000000..a0796b1 --- /dev/null +++ b/hwy/tests/test_util.cc @@ -0,0 +1,117 @@ +// Copyright 2021 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "hwy/tests/test_util.h" + +#include +#include + +#include + +#include "hwy/base.h" +#include "hwy/print.h" + +namespace hwy { + +HWY_TEST_DLLEXPORT bool BytesEqual(const void* p1, const void* p2, + const size_t size, size_t* pos) { + const uint8_t* bytes1 = reinterpret_cast(p1); + const uint8_t* bytes2 = reinterpret_cast(p2); + for (size_t i = 0; i < size; ++i) { + if (bytes1[i] != bytes2[i]) { + if (pos != nullptr) { + *pos = i; + } + return false; + } + } + return true; +} + +void AssertStringEqual(const char* expected, const char* actual, + const char* target_name, const char* filename, + int line) { + while (*expected == *actual++) { + if (*expected++ == '\0') return; + } + + Abort(filename, line, "%s string mismatch: expected '%s', got '%s'.\n", + target_name, expected, actual); +} + +namespace detail { + +HWY_TEST_DLLEXPORT bool IsEqual(const TypeInfo& info, const void* expected_ptr, + const void* actual_ptr) { + if (!info.is_float) { + return BytesEqual(expected_ptr, actual_ptr, info.sizeof_t); + } + + if (info.sizeof_t == 4) { + float expected, actual; + CopyBytes<4>(expected_ptr, &expected); + CopyBytes<4>(actual_ptr, &actual); + return ComputeUlpDelta(expected, actual) <= 1; + } else if (info.sizeof_t == 8) { + double expected, actual; + CopyBytes<8>(expected_ptr, &expected); + CopyBytes<8>(actual_ptr, &actual); + return ComputeUlpDelta(expected, actual) <= 1; + } else { + HWY_ABORT("Unexpected float size %d\n", static_cast(info.sizeof_t)); + return false; + } +} + +HWY_TEST_DLLEXPORT HWY_NORETURN void PrintMismatchAndAbort( + const TypeInfo& info, const void* expected_ptr, const void* actual_ptr, + const char* target_name, const char* filename, int line, size_t lane, + size_t num_lanes) { + char type_name[100]; + TypeName(info, 1, type_name); + char expected_str[100]; + ToString(info, expected_ptr, expected_str); + char actual_str[100]; + ToString(info, actual_ptr, actual_str); + Abort(filename, line, + "%s, %sx%d lane %d mismatch: expected '%s', got '%s'.\n", target_name, + type_name, static_cast(num_lanes), static_cast(lane), + expected_str, actual_str); +} + +HWY_TEST_DLLEXPORT void AssertArrayEqual(const TypeInfo& info, + const void* expected_void, + const void* actual_void, size_t N, + const char* target_name, + const char* filename, int line) { + const uint8_t* expected_array = + 
reinterpret_cast(expected_void); + const uint8_t* actual_array = reinterpret_cast(actual_void); + for (size_t i = 0; i < N; ++i) { + const void* expected_ptr = expected_array + i * info.sizeof_t; + const void* actual_ptr = actual_array + i * info.sizeof_t; + if (!IsEqual(info, expected_ptr, actual_ptr)) { + fprintf(stderr, "\n\n"); + PrintArray(info, "expect", expected_array, N, i); + PrintArray(info, "actual", actual_array, N, i); + + PrintMismatchAndAbort(info, expected_ptr, actual_ptr, target_name, + filename, line, i, N); + } + } +} + +} // namespace detail +} // namespace hwy diff --git a/hwy/tests/test_util.h b/hwy/tests/test_util.h new file mode 100644 index 0000000..459de96 --- /dev/null +++ b/hwy/tests/test_util.h @@ -0,0 +1,172 @@ +// Copyright 2021 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef HWY_TESTS_TEST_UTIL_H_ +#define HWY_TESTS_TEST_UTIL_H_ + +// Target-independent helper functions for use by *_test.cc. + +#include +#include +#include + +#include + +#include "hwy/aligned_allocator.h" +#include "hwy/base.h" +#include "hwy/highway.h" +#include "hwy/highway_export.h" +#include "hwy/print.h" + +namespace hwy { + +// The maximum vector size used in tests when defining test data. DEPRECATED. +constexpr size_t kTestMaxVectorSize = 64; + +// 64-bit random generator (Xorshift128+). Much smaller state than std::mt19937, +// which triggers a compiler bug. +class RandomState { + public: + explicit RandomState(const uint64_t seed = 0x123456789ull) { + s0_ = SplitMix64(seed + 0x9E3779B97F4A7C15ull); + s1_ = SplitMix64(s0_); + } + + HWY_INLINE uint64_t operator()() { + uint64_t s1 = s0_; + const uint64_t s0 = s1_; + const uint64_t bits = s1 + s0; + s0_ = s0; + s1 ^= s1 << 23; + s1 ^= s0 ^ (s1 >> 18) ^ (s0 >> 5); + s1_ = s1; + return bits; + } + + private: + static uint64_t SplitMix64(uint64_t z) { + z = (z ^ (z >> 30)) * 0xBF58476D1CE4E5B9ull; + z = (z ^ (z >> 27)) * 0x94D049BB133111EBull; + return z ^ (z >> 31); + } + + uint64_t s0_; + uint64_t s1_; +}; + +static HWY_INLINE uint32_t Random32(RandomState* rng) { + return static_cast((*rng)()); +} + +static HWY_INLINE uint64_t Random64(RandomState* rng) { return (*rng)(); } + +// Prevents the compiler from eliding the computations that led to "output". +// Works by indicating to the compiler that "output" is being read and modified. +// The +r constraint avoids unnecessary writes to memory, but only works for +// built-in types. 
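A short usage sketch tying RandomState (above) to PreventElision (defined just below); a hypothetical micro-benchmark shape, not code from the patch:

#include <stddef.h>
#include <stdint.h>

#include "hwy/tests/test_util.h"

void BenchSketch() {
  hwy::RandomState rng;
  uint64_t checksum = 0;
  for (size_t i = 0; i < 1000; ++i) {
    checksum += hwy::Random64(&rng) & 0xFF;  // bounded values, as in the tests
  }
  // Without this, the optimizer may delete the whole loop as dead code.
  hwy::PreventElision(checksum);
}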
+template +inline void PreventElision(T&& output) { +#if HWY_COMPILER_MSVC + (void)output; +#else // HWY_COMPILER_MSVC + asm volatile("" : "+r"(output) : : "memory"); +#endif // HWY_COMPILER_MSVC +} + +HWY_TEST_DLLEXPORT bool BytesEqual(const void* p1, const void* p2, + const size_t size, size_t* pos = nullptr); + +void AssertStringEqual(const char* expected, const char* actual, + const char* target_name, const char* filename, int line); + +namespace detail { + +template > +TU ComputeUlpDelta(const T expected, const T actual) { + // Handle -0 == 0 and infinities. + if (expected == actual) return 0; + + // Consider "equal" if both are NaN, so we can verify an expected NaN. + // Needs a special case because there are many possible NaN representations. + if (std::isnan(expected) && std::isnan(actual)) return 0; + + // Compute the difference in units of last place. We do not need to check for + // differing signs; they will result in large differences, which is fine. + TU ux, uy; + CopySameSize(&expected, &ux); + CopySameSize(&actual, &uy); + + // Avoid unsigned->signed cast: 2's complement is only guaranteed by C++20. + const TU ulp = HWY_MAX(ux, uy) - HWY_MIN(ux, uy); + return ulp; +} + +HWY_TEST_DLLEXPORT bool IsEqual(const TypeInfo& info, const void* expected_ptr, + const void* actual_ptr); + +HWY_TEST_DLLEXPORT HWY_NORETURN void PrintMismatchAndAbort( + const TypeInfo& info, const void* expected_ptr, const void* actual_ptr, + const char* target_name, const char* filename, int line, size_t lane = 0, + size_t num_lanes = 1); + +HWY_TEST_DLLEXPORT void AssertArrayEqual(const TypeInfo& info, + const void* expected_void, + const void* actual_void, size_t N, + const char* target_name, + const char* filename, int line); + +} // namespace detail + +// Returns a name for the vector/part/scalar. The type prefix is u/i/f for +// unsigned/signed/floating point, followed by the number of bits per lane; +// then 'x' followed by the number of lanes. Example: u8x16. This is useful for +// understanding which instantiation of a generic test failed. +template +std::string TypeName(T /*unused*/, size_t N) { + char string100[100]; + detail::TypeName(detail::MakeTypeInfo(), N, string100); + return string100; +} + +// Compare non-vector, non-string T. +template +HWY_INLINE bool IsEqual(const T expected, const T actual) { + const auto info = detail::MakeTypeInfo(); + return detail::IsEqual(info, &expected, &actual); +} + +template +HWY_INLINE void AssertEqual(const T expected, const T actual, + const char* target_name, const char* filename, + int line, size_t lane = 0) { + const auto info = detail::MakeTypeInfo(); + if (!detail::IsEqual(info, &expected, &actual)) { + detail::PrintMismatchAndAbort(info, &expected, &actual, target_name, + filename, line, lane); + } +} + +template +HWY_INLINE void AssertArrayEqual(const T* expected, const T* actual, + size_t count, const char* target_name, + const char* filename, int line) { + const auto info = hwy::detail::MakeTypeInfo(); + detail::AssertArrayEqual(info, expected, actual, count, target_name, filename, + line); +} + +} // namespace hwy + +#endif // HWY_TESTS_TEST_UTIL_H_ diff --git a/hwy/tests/test_util_test.cc b/hwy/tests/test_util_test.cc new file mode 100644 index 0000000..d55e2e8 --- /dev/null +++ b/hwy/tests/test_util_test.cc @@ -0,0 +1,105 @@ +// Copyright 2019 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "tests/test_util_test.cc" +#include "hwy/foreach_target.h" // IWYU pragma: keep +#include "hwy/highway.h" +#include "hwy/tests/test_util-inl.h" + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { + +struct TestName { + template + HWY_NOINLINE void operator()(T t, D d) { + char num[10]; + std::string expected = IsFloat() ? "f" : (IsSigned() ? "i" : "u"); + snprintf(num, sizeof(num), "%u" , static_cast(sizeof(T) * 8)); + expected += num; + + const size_t N = Lanes(d); + if (N != 1) { + expected += 'x'; + snprintf(num, sizeof(num), "%u", static_cast(N)); + expected += num; + } + const std::string actual = TypeName(t, N); + if (expected != actual) { + HWY_ABORT("%s mismatch: expected '%s', got '%s'.\n", + hwy::TargetName(HWY_TARGET), expected.c_str(), actual.c_str()); + } + } +}; + +HWY_NOINLINE void TestAllName() { ForAllTypes(ForPartialVectors()); } + +struct TestEqualInteger { + template + HWY_NOINLINE void operator()(T /*t*/) const { + HWY_ASSERT_EQ(T(0), T(0)); + HWY_ASSERT_EQ(T(1), T(1)); + HWY_ASSERT_EQ(T(-1), T(-1)); + HWY_ASSERT_EQ(LimitsMin(), LimitsMin()); + + HWY_ASSERT(!IsEqual(T(0), T(1))); + HWY_ASSERT(!IsEqual(T(1), T(0))); + HWY_ASSERT(!IsEqual(T(1), T(-1))); + HWY_ASSERT(!IsEqual(T(-1), T(1))); + HWY_ASSERT(!IsEqual(LimitsMin(), LimitsMax())); + HWY_ASSERT(!IsEqual(LimitsMax(), LimitsMin())); + } +}; + +struct TestEqualFloat { + template + HWY_NOINLINE void operator()(T /*t*/) const { + HWY_ASSERT(IsEqual(T(0), T(0))); + HWY_ASSERT(IsEqual(T(1), T(1))); + HWY_ASSERT(IsEqual(T(-1), T(-1))); + HWY_ASSERT(IsEqual(MantissaEnd(), MantissaEnd())); + + HWY_ASSERT(!IsEqual(T(0), T(1))); + HWY_ASSERT(!IsEqual(T(1), T(0))); + HWY_ASSERT(!IsEqual(T(1), T(-1))); + HWY_ASSERT(!IsEqual(T(-1), T(1))); + HWY_ASSERT(!IsEqual(LowestValue(), HighestValue())); + HWY_ASSERT(!IsEqual(HighestValue(), LowestValue())); + } +}; + +HWY_NOINLINE void TestAllEqual() { + ForIntegerTypes(TestEqualInteger()); + ForFloatTypes(TestEqualFloat()); +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE + +namespace hwy { +HWY_BEFORE_TEST(TestUtilTest); +HWY_EXPORT_AND_TEST_P(TestUtilTest, TestAllName); +HWY_EXPORT_AND_TEST_P(TestUtilTest, TestAllEqual); +} // namespace hwy + +#endif diff --git a/libhwy-contrib.pc.in b/libhwy-contrib.pc.in new file mode 100644 index 0000000..89c45f5 --- /dev/null +++ b/libhwy-contrib.pc.in @@ -0,0 +1,10 @@ +prefix=@CMAKE_INSTALL_PREFIX@ +exec_prefix=${prefix} +libdir=${exec_prefix}/@CMAKE_INSTALL_LIBDIR@ +includedir=${prefix}/@CMAKE_INSTALL_INCLUDEDIR@ + +Name: libhwy-contrib +Description: Additions to Highway: dot product, image, math, sort +Version: @HWY_LIBRARY_VERSION@ +Libs: -L${libdir} -lhwy_contrib +Cflags: -I${includedir} diff --git a/libhwy-test.pc.in b/libhwy-test.pc.in new file mode 100644 index 0000000..0416b10 --- /dev/null +++ b/libhwy-test.pc.in @@ -0,0 +1,11 @@ +prefix=@CMAKE_INSTALL_PREFIX@ +exec_prefix=${prefix} 
+libdir=${exec_prefix}/@CMAKE_INSTALL_LIBDIR@
+includedir=${prefix}/@CMAKE_INSTALL_INCLUDEDIR@
+
+Name: libhwy-test
+Description: Efficient and performance-portable SIMD wrapper, test helpers.
+Requires: gtest
+Version: @HWY_LIBRARY_VERSION@
+Libs: -L${libdir} -lhwy_test
+Cflags: -I${includedir}
diff --git a/libhwy.pc.in b/libhwy.pc.in
new file mode 100644
index 0000000..6439892
--- /dev/null
+++ b/libhwy.pc.in
@@ -0,0 +1,10 @@
+prefix=@CMAKE_INSTALL_PREFIX@
+exec_prefix=${prefix}
+libdir=${exec_prefix}/@CMAKE_INSTALL_LIBDIR@
+includedir=${prefix}/@CMAKE_INSTALL_INCLUDEDIR@
+
+Name: libhwy
+Description: Efficient and performance-portable SIMD wrapper
+Version: @HWY_LIBRARY_VERSION@
+Libs: -L${libdir} -lhwy
+Cflags: -I${includedir} -D@DLLEXPORT_TO_DEFINE@
diff --git a/preamble.js.lds b/preamble.js.lds
new file mode 100644
index 0000000..f484a19
--- /dev/null
+++ b/preamble.js.lds
@@ -0,0 +1,9 @@
+/*
+ * Copyright 2019 Google LLC
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+/* mock crypto module for benchmarks and unit tests or std::random_device fails at runtime */
+var crypto = { getRandomValues: function(array) { for (var i = 0; i < array.length; i++) array[i] = (Math.random()*256)|0 } };
\ No newline at end of file
diff --git a/run_tests.bat b/run_tests.bat
new file mode 100644
index 0000000..26600a2
--- /dev/null
+++ b/run_tests.bat
@@ -0,0 +1,20 @@
+@echo off
+REM Switch directory of this batch file
+cd %~dp0
+
+if not exist build_win mkdir build_win
+
+cd build_win
+cmake .. -DHWY_WARNINGS_ARE_ERRORS:BOOL=ON -G Ninja || goto error
+ninja || goto error
+ctest -j || goto error
+
+cd ..
+echo Success
+goto end
+
+:error
+echo Failure
+exit /b 1
+
+:end
diff --git a/run_tests.sh b/run_tests.sh
new file mode 100755
index 0000000..7f7d344
--- /dev/null
+++ b/run_tests.sh
@@ -0,0 +1,80 @@
+#!/bin/bash
+
+# Switch to directory of this script
+MYDIR=$(dirname $(realpath "$0"))
+cd "${MYDIR}"
+
+# Exit if anything fails
+set -e
+
+#######################################
+echo RELEASE
+rm -rf build
+mkdir build
+cd build
+cmake .. -DHWY_WARNINGS_ARE_ERRORS:BOOL=ON
+make -j
+ctest -j
+cd ..
+rm -rf build
+
+#######################################
+echo DEBUG Clang 9
+rm -rf build_dbg
+mkdir build_dbg
+cd build_dbg
+CXX=clang++-9 CC=clang-9 cmake .. -DHWY_WARNINGS_ARE_ERRORS:BOOL=ON -DCMAKE_BUILD_TYPE=Debug
+make -j
+ctest -j
+cd ..
+rm -rf build_dbg
+
+#######################################
+echo 32-bit GCC
+rm -rf build_32
+mkdir build_32
+cd build_32
+CFLAGS=-m32 CXXFLAGS=-m32 LDFLAGS=-m32 CXX=g++ CC=gcc cmake .. -DHWY_WARNINGS_ARE_ERRORS:BOOL=ON
+make -j
+ctest -j
+cd ..
+rm -rf build_32
+
+#######################################
+for VER in 10 11 12; do
+  echo GCC $VER
+  rm -rf build_g$VER
+  mkdir build_g$VER
+  cd build_g$VER
+  CC=gcc-$VER CXX=g++-$VER cmake .. -DHWY_WARNINGS_ARE_ERRORS:BOOL=ON
+  make -j
+  make test
+  cd ..
+  rm -rf build_g$VER
+done
+
+#######################################
+echo ARMv7 GCC
+export QEMU_LD_PREFIX=/usr/arm-linux-gnueabihf
+rm -rf build_arm7
+mkdir build_arm7
+cd build_arm7
+CC=arm-linux-gnueabihf-gcc-11 CXX=arm-linux-gnueabihf-g++-11 cmake .. -DHWY_CMAKE_ARM7:BOOL=ON -DHWY_WARNINGS_ARE_ERRORS:BOOL=ON
+make -j8
+ctest
+cd ..
+rm -rf build_arm7
+
+#######################################
+echo ARMv8 GCC
+export QEMU_LD_PREFIX=/usr/aarch64-linux-gnu
+rm -rf build_arm8
+mkdir build_arm8
+cd build_arm8
+CC=aarch64-linux-gnu-gcc-11 CXX=aarch64-linux-gnu-g++-11 cmake .. -DHWY_WARNINGS_ARE_ERRORS:BOOL=ON
+make -j8
+ctest
+cd ..
+rm -rf build_arm8
+
+echo Success
--
2.30.2
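
A note on ComputeUlpDelta in hwy/tests/test_util.h above: it compares floats by
bit-copying them to unsigned integers and subtracting, so adjacent representable
values differ by exactly 1 "unit in the last place". The standalone C++ sketch
below mirrors that technique for float only; it is not part of the imported
sources, and UlpDelta/main are illustrative names.

#include <cmath>
#include <cstdint>
#include <cstdio>
#include <cstring>

// ULP distance, mirroring ComputeUlpDelta (illustrative, not from the patch):
// equal values (including -0 == 0) and NaN pairs count as identical; otherwise
// bit-copy to unsigned and subtract. Differing signs simply yield a large
// delta, which is acceptable for test tolerances.
static uint32_t UlpDelta(float expected, float actual) {
  if (expected == actual) return 0;
  if (std::isnan(expected) && std::isnan(actual)) return 0;
  uint32_t ux, uy;
  std::memcpy(&ux, &expected, sizeof(ux));  // stands in for CopySameSize
  std::memcpy(&uy, &actual, sizeof(uy));
  return ux > uy ? ux - uy : uy - ux;  // max - min, avoiding signed casts
}

int main() {
  const float x = 1.0f;
  const float y = std::nextafter(x, 2.0f);  // next representable value up
  std::printf("UlpDelta(x, x) = %u\n", UlpDelta(x, x));  // prints 0
  std::printf("UlpDelta(x, y) = %u\n", UlpDelta(x, y));  // prints 1
  return 0;
}

This works because for IEEE-754 floats of the same sign, the number of
representable values between two floats equals the difference of their bit
patterns, so the integer subtraction counts representable steps directly.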

z%ouJUgMrOTF0}d3EY`&_(_`Y5fikQyGZ6ty);R1rVD3^He9m;ai8&``dN>uLPC$IIJJbVQnpeQ`HGKkTGF!6 zH9>HX_7`LwHwHLn@RC$FtUr)6Ynp?yv=@55S)W>vL{ z7tV6C?#cYP`qcKf7Zw+S&rvE#3I6~sK5REbxofzE_i%Wrf<#v)D9Js_iiag7W>Skz z=%OJ@J6SF#ybBB~mUhPjr#KwdnEX$nLkjJ>60lr@9Bjn?Bz3G$7G5-39nG?@Hw&{q zoP+%<&n>QGf^}kBXgscXHJmWfd$OliOJmvp0O29hzRGSRxekZ#Mn}#)w%3+;vhwos z>e$_^NJK@^Vi*BS9ChclShAGrdcCA@#z7CcvCt2HYN_H_E@0HqUfY;mVgM=Ua`VVN z`qq@_s7pd8QO-J%_-f$AcPu23vL;8=S5&@nVX#QYw#6o`cuT;#eAcjDYBy7xmQ48( zKE*(K0(z7FHPu~sO7$R%`x5fr+C^em$nqC;$K52cTz(a|Ja225NlT%JYLYjGZ=#hZ zibyv)Z&JN5!}``OzN;i8Zwo9>{1+|qk?eaK?jz8=P`@nVA`e%Lh84G{_!CL-6{%?; zxYO?uaF1^xG5-MHw&4DhO~at6xxLZk_wZ>KAtwQ>i?-5aU;^i-&G%S)ySZ)E?2Jrsp`->ofIrwE{pGC>scm^&RGoBw+nG?EU?cR%F7#-bw0#$E3wkPBkB%2xs1wN%YJ3gX;EaZ=d;ovnE&%`^0?7_=?WG@SVcL_Ny05%Vrr@h=$MNI5k|| z#Nv8BnXhSj(wQ{t!8@w%HqN^zUOCDhzwjK_huvi^7J?PJL(s1v_d~kROzsRP8F~ z}eKqz9%xAmv?BmI&ov7^V5#2nSDD8A8m3`48Bujas08$GVge49IX zum;GHagRk6CJ7k_H6y557-RqsYHrN1Al70dXA$EhfIIq9mREDMgX>P|>IEhN$vjX4 zOL6q4hU1a>_NIkUq@D&kifX!mFhTltpktc^W{_ZWjE}7}mt&Pt`1Yf7XCM$w0v9+@ zf_G=VG_YQ+){yoEx>K+h8+v=t19yNj6t~iWD+0r3Jt@F(yE!M)g1N}!J!k>Dn339) zWS_i#f|G9m3>pZ=aH9vdXaF38oRRN9;B)%ZNj(Nn^rS4sPESweKoag4+%cb}C}Z^Z z%|o6&F-We*xgSbkOmXTlN)8pVh7|#Sy`GeAJ9ruPpb<7tBLkX1$8JdOd4uhJjF7%M2FeG|Y3~Wa^H10bVP2tB} zeJUe7kI)lU*+4uBmN|;`85G1WW+wxRUnS4qd}r2~_MOj>Pu+7lU8%coObSd8@3Rg* zbw5f`2q&&;RkFP&ED|sJ>gu%{tg$`>&`5vjn{NJ;mc2^baYejiwJD7kwMK5)aFQ;@ z^&|P!rJO8(JZtlGBy;)nG@2JOARMm)rAUr^wr5?a7y`ZOIa#De+WAw*$$|d>>Z-re~Z3NI|;K_`+i@t{g_7(7~0ZfmLFeUohp*a=Uf>S7SA5FMG@tf9f9W+Vn;DW zwlRUwnyU)s$5@|gOwr&R;J6>iQhAUoY+euh3LdirQc8YKdHkzN1^YUHi_fpE3<>A8 zkH|+GorZTfOdNeWn$PnU18pyi@WD-LM|A3WKaDz7OQ`n09)wY_8Dmk}1{Wj&oz%wE z7_r1rkD#Qv)8v5e&g4JDqxt$(P!M+}gd;-U@>yGMSReL4t0G&5Z}swL-cRtXfZKlR z{d|8MXFBXPcCLJ5G8ki688jAt!Q^TRB=e< z-6SM$+IHl7)_sQ;Bew>SBciqODODTbM)wRstr+}I9sn^o9e5!B0G^e?p(&1PRG8E_ z8Y!NM2Z(Ilerb$n(bUz|*X+Op?U7=<(PlXN)ih5evII^^rN};G&@Qj+FC*C{-KzuY z!m?!3EZ7E1n?vdIio|Fw;RDaQ8@qrh8Lg5Jv_Oq84YY?N1Kd$zHfmdIcKTkcutH-+ zVxwWtoyG?~ndX_|tIMr+>etG%`5=sq%Gunj z+Bk80Z*XKCJgvC>OS0#(Yg>+ss5EL_qP|*+b*QWaHnCLX!Ds5 z8%zHH7CUavE3I9BP1M+0_=?Re?mp_>E_}bizVdwzMO4-NCvhAtHJMwenKz+K6OPA@ z8~y`HNvCNP;TbEVoRdUp<9W2HzTXNB&l;}Vh===e_5T2YE=6PHjzP_9v`k(*sEecU z>gYW9DXWsi+X4{uim2 z1E8Q{k&(~=TK*S^P2x)gJv{Yq%b)3411GL)r0`5eUliKo_!dX}m;V5-K-ubBh{4Sp zj|6j$wEdv;=9%)a8%Y%+4Z)90eJQF~kC}%#;M0bBdUv6K86D|Z2ID-Ql;WcSvF(vd zx2`#)Z8#H>ODE>qWp!a#`_6 zMsxFiU8zaQ>Cb9r^Y20dt&#GB)4d;b{duQuC2__xO*q?}gqi)StDrmi)JkBO#+EPr|Vsf*uS*%*vdw`XrMkUuJSnRPP-1myM{RMJDU2=2^#kySf{*}w>CrbNdQD~7(_J1uzhEcUzPvT?;m0$OXG=SZYN&eLE-P_t~$#-=Hx!94m zA86#LUZdsRk9x7H=)NAb(Hj22<+&h6(J9?5a+V_q+q<0O<>Xf*5D`coKatm?rv!c? 
ztytViV|NAAE>dfGJlKxV!5h>m<25Z7q>jqt;jWEzry5t;o2Fu&fgjSj+y4NBQbU)w z)dCKoZX+N15I@qkm&cm3L*>imw>-Kg#>cmB(zPsnPolc+o-2svJz7ELXZ{6F?Gkxi z-+??qYS@m??Q8w(X5IQ68jiSGGm2+TKVavWXqN+Z5}R zA$Huq1b?avfPT59&)h70=+eB7T%49U&swzvyW@1{J8@pMd*S~83(2^?lV}SbN&=@p zfv#s#@J*2hT^{C0q{sTHn|hyMdXLtcl43a6t}%si_|>Gt41=1habaimQ$p_%$}t|kl}0k`c;UC+%qk0^-=0}{{RowYT$MQXgD3lY0=E_u=eLZlr?lUCblVfc zYF#T{)pTi=&SYz=eiU2DCQ9f2dB6V3c&sM|ozc7}rJ0LwCC;C7_KP{539+=5=)=*6 zMjpnjd3tWC$L@5i>|))tfN`H})kaK-MH!cNcY1x>TV6u&Ortv-bzkoe%ss~x)gu+Um+)Iz#b0M8Nu-+v_Kl8$0{RCGmLB(}H50p2FTcKsOlK`J}6xmGm00=)R!R%>H2_5l9K44Bi z3PO0>jyn6)hz18BbJx~}B>bEnbM>SI^*#GijsfI@Kn*AqoE%Y)n>|OpIItm@4ZDX` z6s*O6$Gr#yDgaV?A6jWvP%p&5cw~YL!wmZ?Z{{RjuLKI_?7ohsnUm0F7eT^U*TO*tt9%;RB zLgaeW%xuYpBoq15a8IaeU=9cH?f~;lJSit1jXj2NH+?91NE%{?JRiNoOLAeQ3zGRS3G?)Q5)@JSzEbU3`z*{ZP2La=JyoXET&QlZ8$&0#ZGN53uMiK>%4PX84#5Stbc@zFzdx*7uv-;DaCHR zwEK3F9Zv^g8LT_YE1Px68?sN`1pYKF#4)ZE8U@0dqDcpywB%*omKhl7nxHRvpxFQlt_>5Szy!A&g^D@G{E$u82Z*sw+6g9VlYS?Q}^egpa~W6)}&bzy;LlUcRL(<4)mh!Uol8beA?w)3RKdkN64e_*N&0t|4s@XC7gdc6#T(>0Gs?^pMCtYFwpU^gT;Y zi&nJ_6DyWdBbU?Ij@IP;WT!eplM+fm=3^Gb#*O+W0H5vVOICohh?iiaeHbCt5Zn$msI zx69p6LtRCUI%wE^%A3lp8Y+3gx)^)2*PI@^3EM7L~i6!|H3Ihr_nwP`*{& z^ZEcGkMSn8#+Nh*Jn8MN;ZgEhTQ^hw(+bVjzQw9m_>VKxu9h2iNyLkJZ96BI$H>UT zl6&VTrFV8(PL+2hwb8Y<^CDD{4AL-d-+Sc*eB9LyU&B&sn#H`3*+mQnA0|+e7Rzqo zxd1vr#l9T&>s^E}4L--pyShTrjAW7k1CBs7l}eFqE9z+&HLZ#%q-m$kbcRb=3uTOQ zvVwZ&X*m5Vj-%n*T|xkt7gq%pw=9L1AmbjK)h$=XUK_Ax#P4@7JR~15&;EixTF95g z9xc~U{{UraiKki~w^s}|WBvBvf529njNif@?@`*XgY7IV<}+%k0+s#T_d8c_{0BL2 z!ny4m#CHA>)s^jU{{XeEr(Z7q+*`D9%O9XDd121h+sC*jw{83vV?5&K{#&h1az1ip zK3?zlm;!5Be+X%kS&Mhmtl(DrzC?e#&;4{f{#CSeR(hDu%;r2@pj&Bk!+)ho=V;gJ zCAN@%2qQax`rU|O+NpR#^jUbW!N|%;<`(||fsMb<73f+og<#Y)txLlfl1m)0Ad=I} z`-*!=o}V`iKiaP*xrTci*NR0k+}p@im;0`#{6$*Wo`=1R1KD3}Q&g1P#xMLotprr%`<-5=WN}P1)DWn$V zP5||#kb%Ov&p(YW(SkAvIQmcoe>Y5Z=bkA*;|xAutvhgDH~@8S#+17;*rbuvjAoDk zKPco3d-bM{nd(g`&NA4~wKr}#Ew{f1tuO+o1oMxr938xIP6LmHIyC6n-=MO&RN8?^b9tM+l$urD;p?74BeKAXo+{*pO;vLntcF zp>fE5hMLe07_KS%0=^ZMhC5dE{{V-)&C=L%!;=^P01|(tbIh+Blwehtf(XGMT3|^^ zr)kf%MdniP1y8L2CUVs6tCjlHXRSFt^#EK+Z~&>+VTz$ZBvD{AOQf7K=BJBdt`K9| zvX(+BcOIG; zD}e-?0R}R_bjAiMV>5{OKbseY3tDLUye0LQrX&io`e%JRq zi6S-O7tnhCHB+T}Gsj^an_Nw%Tf9oc%*-p5Q^8_SK9!q4hV1V!V1=!il^KnTE(zhV z-9P%&l#y05v_~h5aa#T$CSMG`;q%1k-;vw=nxWJ5Y3&(`6Fh1G$X%>6>a}xPjDNy6 zd&e6$XZ{2!bs51f3vD#A#PK#Aahhsv^He(!ndFYur*;+ew0=tfyqeIHd&AxjyVUF< zy0N;kmPrQYhG_~i`*g0t+rb*^TFDj2(jtkMYi_zPk@Og^Jkm99v8qQTPb7F4*m=CC>j5FuBARDh$RSOrxHtn93?3%j zn=iEZE?3lWYPPjp_fhn07oNd$59Wo1>2d-*DUvw$j5Z|JBb|d0{;N|sYZ?8yu|SdX%fBH zjlW@ywz-ASY({)%73h)L=`lbf*&Wm12A6K|$LswCVQVny;?8xn zg~Z$ych3!x(===aB)fSETuj-xiNsASedkla>HPkc=RXoGMZbxD+Zy6K)!u*KUK{fp zO?R!Q%JRA+1{*dEb=}wVCbGUD*$dAM+um*c<+~|!>z|bW05SPfi?b~)2|N>YZu}`` zr z6;~m%oYDeYAe>{SJ5!D`xVBCy0Ntfn9^U;Z7zZHk>uG<9Kv^~F1YyLXbuu2-5x;GaqWFaYh7 z=}5=W06(29gE&0#$fO&8iFx**2B0g*7&N&or?|%ypeXr)8R^oOsOJFC1VH1QeeOC? 
z>6Q(WI}uO8EsSH2#)kvtUf)^(VZdIvpbigAH=w4J!i@CI46VZgq%TD$0)Q2Pj2@$& z!iqpna^E*JPzRBiiqi^K=BJ}xX;b+Z*5(&HD`zPu`>p*t)?|?t;~2#>@2V1aC`)dx z5Ae6GR*@PdB#xD)P}F5ZF4kO%p_T3K3g92Rj%z|ZN-~u~TXsN7ay>iLmSm%j)PT_P z{SIjfAo2XFK-e@r{4lPr9DuDh~&VM)pSD>0O-6=8x72s8mOrv=IDRl^b9q*r`;&@)ce0cZ!S8tpWjDK8Rb zh(;PGJAvTi8Lk@EawLS9*a-8SbDHV!NqjS^NTYhIU^lU;lDH;qTwm#0V+BS2(zsrs z8*D1xh5rDH{sUMynuX4xclMi>Ld0zg8UTi3SQl0Bh-Czjw;%A1!xe+r|YHt<`*{{ZPdFh()&%>X{; zJFAk=*uW(*u=7})f51g(J^jN-@ai#}S4nmcHY^tG0AuOT1FmUwe-FKl!;w6vf8ITp zxjyH&<63&wys_U$r8xUcKkBCrfz@SU@~#h{Kb=D;fn<23p2}Y^BY(%uwZw`KmQN2G z0m}NRtV^#D-hH%NM5YLklstUIbsc}tHB(d7WsVyQdodC!lFQ}DTrkc6JHH%?u)3qi z3dbA~#KZ{FB^!eJ@^VinfPbYH8KdFtZu;q-dz1}o>{HBtl!qYWAKv3XO5MNF&a0wb zrK6w{qbxx8Aou*MnYy0hBDOZR6KZoFByz0Lt8^N?E8-1o0X1FlzV->ZoIas3#2=vLWg40i1c%+8hH<2SNlnJ}$QGmd=P7P^6qg^v+$zrFcnA!gT z3YP9>>Z6%$9@_TkPOi}`il^+rZjoCYujgBz*|SBbwX_QMO|*q@z_=&)w)2zq#dLPM zREWD?K^A(18)|F2{SQF5GT&X>zyRdR;b>imb8lmo)BG(uNF|c${^mkH{msG0{qVR^ zTWO(38fHg1JwW+umDWBp=$6Hz(x9IC$3q(IPy6Y=!nq5th@LIhPTLqGvsUOLk9+?B z-%b7%T#S~VQ`Q>F&ek@XM+l3~NeVwY&9%R`)%49lY;G1sg889I*9Ur(+>N`w7&Z^J zd1PJ=@h^y_66#ksGZKI0;mdxVE4c8Ek$2(QbzLU!Zs)a%%HOCpi+5{uj^11@(LUmjkP!Vp!msIUi)(WPgCYy3N#&h{~4_U{w`ESr``Uz}&q{uQEn9Wm8h)hm-}k+Iet`VmDNW=RQ|mwJq;nb$^0#|n*H^4g`*o|fzJLjlEbZcvuT&G zt8TNAln~iZoPL!L!>uKBb=J)DL~bsxRG~5qyjyo6{7aw2S8&OUFbCVUHuWQOfst_h zms8NG{{RXC?EAO}^5VOxA&y`5h_Vm%)HP9?MzNHW9ZosOIQ;3Lsm{Q0pIUIrlb@R>xas**a9Erv&(fF+v!_rRnew+)Pg_%D1{{SDMuO#sogYUE%3r!!HZJs7U^8P-CwT%0gq;c#& zchj0;LQ5-*1L}HJ=_B&;sxoS`Fj3DYtfBKsFPFA4xxK27*sjS8=okIw6?e{a_l_2j zvi|^kGyz#`3w`;QJ$_MBEv>p|jmO@l4l~eW^QVVt8wnpi!!!XCZ)W)bQSD94W!Cu zvqc%~!#}TD!MU_lM)XCWMP3d!3Y=d z>S|o#w_{L~*z>ZuG^y6RAkLgD;B+vWQ?YLalg6T0S%$lu$ z_1NtH0Lg0YwvS;n(Zbr3w)23=4$=9sqnQDWt%1O-tzyv1i*p$L>|F8Hmb&(~)2%0x zGivT*`A?rSWS^qqwDgOJn&8B?cGk-vDIBR6D){+HW1cEp`iSN{GarcN)708qTu9)P z8oWmy-F@qB^TU_75Tu&D)up6}>p4=(zj*7@md{Go{>;}ehSr)=8@tFy=~gW~D=oZ& zNv>Bp=@Mu0B-C@DRqkhfC2ckC_IK36XgZJC`-p ze`e3-{heVhqm9UVkHVM!5viOU+hiZ_RC6*NL~_yGUEf|uV`(H&G>2-aWr=-xHB$3U zziGUkIo|w_^|uirSQq_~Bjb1AF~xU~cx}+}Z3KUZ_*RnW@!Ljazq^Vk!0PT0erBEg zh3#I*^5}d4adUA)9$9>)K2UBp@Pp}**YmAa(=`1mKRW*6-WBQ$tQYlewaCTd z{a)YURGuQehSKW#3-yXf-rPjKQxzL_@Bam@SAA%J}c6N#k<@BE-bEP zE*A~7vL1u!+#KK*!+sC&%5K#)D>&JIgMXO+0FbG?R-2K`x+|PS z{wwk4h@7Hb%LHxw7LkPgdj7ROo$%LK)g|A0?#|q?k&KP~Yku?Mmxgqu^0aL#TgM$9 zUy(m5<~0wAUNF|diqY1_-MHCon{Yn0UiYB={{Ta^gW>*xp+qOvFCdR-$rHFAr&@;Z z$Nmq{<0+8rHQpn|Slg&DEr0Yq8RP76d^h$=aT+!2Hci~xcDWn*nSndBckWeS1pIi%lYG~4N9TC;7s&Ej^F z-YId8>de2?Rk>q$wB(sv8>MbQ10x^fSq(6c#8!b@0!So_j`-SZa^gM71S-d9+xSEm0qw_Xm1erP zeX&ai#n(^8!0#!iJrKZsz2WkQ5o+_-`Wt{xUYY}YcA$FhA zu#;K7j4s)Wu0Q3ff@<7;B0GP5ZZ{6Q9|Q6j)l{2H0}dVe)j{~n!~T&`hk;9B}$%sITYWMBRsE6p2nCCWAfni_Y}fQWZ;47 z4I6}eNx%f)ds4CFZcsDEI?w^v<=`*10dU2ADF7#MR5;IbjMD7{g~#ha2v-L`F*H!h z3hc{$dQm{mA~{Nsb6VCKyw=Px=K{IalZvq%n8kA(Ry{*T@n$xwzvgSPhU(8$gJgji zFu_*j8t{m&N;XvDx(zSH6I!zT#}w{Towql9P4K~F^DcBxl;D$WPY3a~TUM5KR>`?^`+eQzl@))_;aSS8k*b7Q&cosP?;(4( zyS4JiEYXH?ew`}J9vrrDl647q&&vWv59gZJoHv@&HPy5w*Kh92Kg;wK({zD$&#FjM z(TG9%n#-O|ZfVILg=N!p%M>1dn>4P+3dZ2JIR5I^lwaAl$g(2oagw9udp|js{`x!` z&XZQ~)wbhtsN1*uCg*X_{0B8nyl>%4PSy_7KG>Vd{%8vTP9NFM!m+fEQ_%< zOSVt>=WQHz>VNNn{c9=)*Ja)(Ebw!;jGnw?;9{$vh?b5D*m!$Uxnt_efc^>%LobWG zNiQ&H7B*?v-N;Tq@1iS6)2Y~rP9}tQ7yE|klm`C$kN*H&Omqdih0=U9NADsl5O8Y& zue@KVNw{iuuxF3-J7RJ8g~#hknhvjXAZy)6QMmx}reP32ktVO%%c~>WF2?+tb)uFs zUTi#bBZ2(toVOYcnJoIvsA2p+Xg~WXssq7MNh3V51CyPu&n`a`iob8+I3gQS6lbz3 zuks((twk=RH*z?e8B>2qWT*`t{oYUl3B!RH^!yX_-PxzvP-rOT(=`Df-t4;lLX-hVn$OJro- zjx2a##Y{smfg-Ws+@ib&8YVSvE2uD?!z8t#)qhSu#8`dA~DL6uP7 zE90)x+>=%H4-$Ac!-c-xs@X><{{WV8pE5)L00CT@n>IsoW;9+Owwh*{tnM!4Zc8x{ 
zFXW>&uWz7C7ZJk+s%N4uPv&cv)c!bWFosPZNw~Q<;#;Q?C;jp=-^#dMU*i{ybppw0 zb8n}{e}>KvoBsQ3n!je)a=xdze_=sy5tiz9R9;KS!AIt67XJXo{s++6mTg+nNf-Dn z^E-dzyjRYbQ)`;+yYFsqZ(R4dm7n?5%RdI{_jua@MEXaGdo$#vm=z!u!C$fUy%DRhxg?e_D zf?ZPL*404eu4YvII?>GpE_n1u0{;LJJZY)Q6}6kF(YsqB#y>$-29K|6mqJT{b#w1L zg};YS*1sl=;kfhsGG54A{{S5r6kWZL?>DkL zUkJwfe}yhIw}GHY;Szn{l>YH|pUS!a0E!wWr=WPbymO)7NZ?MXJE7cdB~=O)hxZwA z!S(kQnW20|)^!b5DRpar+IP(ymUiePk7Et3!0K>LGhJVZ@2)f()bRj_YFahaG1%+Y zpl%YbF%`fc?IFf}w*54u`V|c{WqbW|PVslcEnecv4<_z;5^Kvy!EY{Q+qs7yb$|yy zojXX>+ez>yopZPM5bFXd*Rw+=)nWK<4fR^TrCXiXT+wC9wQtn$w*fbzE z(XqO4JvkqRV%U5?)inkWFnpfxUYp|28Tfm{n)T1ytnP0wq&wqShzqkM44-nR^R71c z;@-b_OnNSvZKt8=;dj60YZuLZ3|}_OL~QB53S)vbn)#IxM=v4TKQmV6)Vv4bak5L; z?x$b*Wo*08=U#gDo+t4vo3$(Jx%YNvAP>sA4Ikls% zY`Gk7VUttNmdISsqBu3$S^W zLAZ7Q0D-s4zom9IUk>!h%E_roajHr={{TvSr~U9gTIyi4hTbKL7?LR0qN0TZ@D*H- zs|N=6XBB7RJx=w6x{b=)FZU6=*)i$3{G;`*j>E&+ZkR-n+XkEddrEh|=6Ze=uyDTB zR%^JT>e)3)5nByGA>~I_stm z-kG~2rWp6lApi^>{{WBHps}(sL;J!2&#gRePsKMwSP_NgO3HbEqHp?2-Ui~@MT zzR$^hh_n$Fz*}AQ8nc8@2?-P5}q{QRXmxjq@mD$4vcbu!|6CH*jsm)P;Isn|B@R+}D~$gD6&3@z*mmb`SM;Xfk;ou*t}+{`t@ws%BEaYv)%5WN;xOD} zl>p}{gZYY918!Y5(sP+@<1PK*HlNa{NuXOc1ai$DI%f_jPZ39uv)kKPeadZ$0ilV~5q z&$T;&!0u&Yt&Q2v4-^SEG3lRi(z~f`;h+6%NiKiBfc~{YZ3@N5%C8as0H$xi{0>DI z1C<3>fY{u6QCo6&VX|5apv~zbe^Esj1I3l<%E`rNhN+eED=Qq9FGl0mrAfB7Gg;yI z>q;fXNUUz!>Jmc0s~YY6JL4y}aFRAxrFks6y`{erPKtiC_&iUis}J;$KLb>hyA3HL z=?SFLJWr-IrTkIL3-gBl6WD{;``3^7%`LnMs6`#*S8o)+n|Yu?vQ-^bxxiuEd(~|# z;uf1`>%RBp_6E8gTgP4+@iu_^bdlWMT`Ave>ttjRN;)^q+p+r9N15)8pxNg)J~>;Z zRnqi%o$-Rsc>I&E=T(G$Db$sw68(sEz#dS?``nuJ7$UYBpBxeOt;nSkdXEMxNTTG04cQ1fAl;)2=cYz2{vI{{RTR zzuSJ!;pd-r&>CJd??$XwII*Kc(n*qSCDX18%OC!C|#a*x_tY-bqYV)IW@gupjpJR%YSz5 zxl1j)AASC;KIpEEd^dC#b|MGUcQpdRHNNDM5;^B9gZLVDLc143({#gsGRb8KXE}(H zTOO;=`RiQ;=80!}Zeo%oQ~WNf0aNQ(N{lwVZdKIo89nQ%w73^`vChB7Gmk^wnn(^` zM*`B~Z5s101e>GU#~>eXV_coIvv^CzGFv+_NSo#V0B$4G^!`=Y_@WaH#l7NWZ}|xw zm~s#K^sZyas4sM_GhK`Cqy3(8J^Fs5^{JZDr%}AX`%t(o+F6?)l}l&fT~_Gi!?!)ip{3P{sP$)~&EsE( zx;QIpm%5CDr;`YcPvUv3d(Vvx9#=>3-IcfZWVvH255uKHru-$;F7}s>Mi}a-O>`0b zGSReT^EF*U#?TJFeZr~yhD{#g*qg_qI7|Nkh~78WzjytoaSUhpEfdOL(zBt^^}S7m zk|fhePu`rDuSt_#_-&x&Qu19wMd-G0ai8~^vo3xx+C!N1KMcU650*>EE`RTl#XHNQ zu~(bfSjw^ZYpZHZzHuiV0x$SiQ*ZFD!%R;*Rk*jcLCI^GRXx;&%LF~qlpNMihjhz- z2Uy6E>US5DOv=Ok9K{+k{{R$d-!ncq{{Zz@M+b)dIi{WRJohqsyqkZQH89nzH0=F? 
z$_QKg=)W=l0D&HAomXO=M`MtGhu5AU)a2DK;&{Bxp;k^dqKqi6g3rMk9+;rdA~O!k zGy2nRyhA+fesX@5&FWq%va`s#zKtLIo3}Up@;R*ThC)hbdgQ)h3f z_=eigEq!|&C$86ToBsJ6*DZJBaVr$lY-PA^(oOO&{qi|KrAwszIiBIe%A}LYE!6!@ zclt-e>w9gb;~Dx^s+>`3DIRAXhl#vKc2996a5q??9$J2!pTyT^;opVUkpz3FgZIG| z>2`V?HmU+f1!#z)iFw9-s@01|cAul!>G25Qg%A(!t}9Vf0;|b=97BqwHmFW^(V$q> zX%PlVsy7;vCP3?16J9(202Nj&Ti$_}C)6dBoE!?JBe^--!EYy&yQQDBQH;0c+v;&t8SixfSjtRZmhI)o{{RA+ zy8RGTUuU+0-0s^XdCthxq!}Kk??^|v{3%>^2_~NWF=thXaLSGHFSE(~NBxxi>c*1M zJ?ThJ{j0T~80q{Ve+v74b(*?}zD6kAM#eBW!k<6~AC*7B%ep(s8J(NVD0A5KKj+?% zb8Dg1vEZJ*!kiKI@7?@4`cz|}?@NrIS_&7Qat=Kx-p4%WZ$JHdX7h|6!j+u-xQ{8G zr1YRe%N+dN4^hrdBM-U=VYzo_uN1&I8E?y_DIG{$4A29-u&tF~aq|)Z#UX&nAXQRW zjIhBZP)6=LJzM&|2G44@vE z9AMHK7|K|e#s^$b**Na!sKq!EMh*e&JJVhfAUR%f@{Qk}8i0m8#t7#mj0$FcP#2QD z$E`gh5h?PJ0uNC`IBmRf(~dF)03#9utv^p_f2Wq(%w|@@YT$U@w$yady06&dWjx)J{PndNe^{B}#_{nT^9jOf7 zkHpTZeEU^)$Qef4`1h>{waW=q7B3b~0g-q=h|O|bZX_rJcLO~sISZT<^K(Uj?&6l( zbp{EelXpO@38J`A!_5n~4tV3K`cXxPba^FMjlf(QfD9UbaMgj&Vs=ml;CSm)7Aw78 z;jOJnjugk6F#bc{pLIJ&Y<3L0e5;>Y>U2##Ei7Y=0o{&Hda>$i;H|B zmI)2Ca!5Y&l=C0zPeENZ*M`2wcrC^X0oiZvxE*+&PtZ zBOK=?PXzFAKT7KLi|Yw2m-{m2dwoTr!FT7&KzUL-9(Vv!byT3_p2v69_=CO{XX#epGvc>+S^;)h@iGe(G-2u03P|TIh@Z7 zVryyhVo4djdP3u4)Ud5@;p2SuSoSkf!r&RE@Yi zoQ@b$y~#9mg-5?>XLfY8R+X^aHr}6Jf0adIw|BElkxT^an~$16%AUV9bF;P7b8cZI zY`@>>^sAPaM(!e9C~@+*`@5Ix=|?afI}KLTPAzY?SXcLC+mpC{9R8IZzN4t=HqO#n z$rMg{%A5xO06w)2i>oe{vc-onu*#zj1N}!q_*V6Xwl7uIds2!A-b3c89{A*bBC6;L zmL4cg?6o~VDq8>0DVXQ0IIzLS)pi(xyrZ7N7lTu$5fX|@gyb2*rAuokmEUDx_`&;r6z`v z%IF%i*lJd{y*B~oFX+Sb2EB0k8kLN;mkK0;blS`PktnQ<@vAqwR(=brRjbsf+(-z9%M%`OOKg7e*@`IX}=G23wXm% zF5~O~s#1#RE2XkLX7j|}I@k9t&aHY;zk)W{zmnA`JQu3p$Qo}k*}uL^AJ)E$k4o^y zo(3RC{{VRAp4$GIW-)DT5@Y+Q1wM9zns3PQoipJj{l?PJIbTvZuE$RJMHB!v%fz2k zn(1!5VC*p&09GCMiDcSdX~HL~w$b|3IW{XdyEbjSCE?vL4>A149e@I@{{U!d76TTx zaSw7bYs{~_d!@mh*WPn~b`W3raa{f1#2I8HS=dO#^l9I3=qd6^^ctpn4d067JR<-r z3ire;5Xm&*6Z_8LN9$ft`qzndBZQyGQ~WmRxAo-ImfjB3^%(@09&D_CibgDdTBS?5 zN}qG7z46wS2_Y_8;r{@;ZiT;Im4kERT}J4YvD4Kv{ByB${{Vc957xRJE8(2ax6DIb zzL)Uk-Vh6*C+S)$aYZP_9yj(+5o<>m7gq%j@F&Q>lGg5%@Y;DuFBSpzuX3{Ry`8Fz zM545Xy|udb7^)fOdRN0}BHCR!uDZ*@wsxv968&plT}BAL?mr67xYgX`Cln1L`deFD zj*2l&d+T_I6@w+@u1PIbnpQmKfU&hUtXm;!jJK}mmLjvuLQlYq?q;>0-W2L+do0 zpZEx+aoF0v)hu-N{{Tq1jTrsZC+2UU>T5FNTf220MX|fLZLl~P{zS@n- z3%`&z`>-#}-|#BKz41z_1=B2qn|MjD)C6DmU|t9O2-L86el?KGH<@>G{IPa9WA1v? 
zej>L>wC^rB)-{ghaH_ce6H4d^^c`*+tY8gYj~pe`d7J0 z=XZr|ZDEXk)XS=|Ax8r{o2RvNx(fxK{GLYm!Td7V{{Y8pZ&QTaS=vQ)HrGo#Se=0c z?p(LxMne7+`kBt{*%{EsG}?J}Z!MS23WK9Icw_l~wbyuN+@_YT8%A1L8J)rI4iELN z2UFF+U$NRTOmNShYpd`q%=!U~Ji&0_bB(#d>-f;tg{au-ROADYIPFetfv|FTs0(fR zaD95ygD)rePfC%JWbMv9vr2w$yl0w-KPRf6(wq+95Rk{!JDOlG%euNowK;K&Fvdo5DWh@@Pv_Ex1Kofu!+HAu04iu?QUPoe(;|?%f;b@a#Q=J4A7AjG z0IvT4lY{*K02*8cAP`9J&ND)c?)%4%=b`m91|X7|%>W@8I2al4Mi55ajE~?0oKgd~ zc~OkE;yG$nV#DS8-3EKm1B-)za6seRAB8Xi-BpR(_z!wgyKp{D!~h5wIPFGLCvPFS z>(}0p#*o_wP?^R^<35y%8aG!^4^x7w3E-RDjX6%fPi_R1a@`Y0xB#0(5tjl zOJ+71&VLWb>rdXra2GzuoKlmJe}|0iJktPmPzc~-9S7v(9icT7D|Oy4JKvE%6PrTc-1{ge=4WGghxO zyJ;na-D@nW>e5APJF7%9UZnT{g6jjd(+^e3 znkKP$h>?*Hj#a$$2k^oERmoiH*HQvJk-PXye8g2~*HM=vZWq{MpD3=5isn^{LX)f@ zle_1A&GpMv|RYXWs2^?CR2=ado~ zi9hd{!L5xqS-8{N&#=*Td#%GBZSLC`Kk^~me<~MtCek~BV{sy}U>7|ydRG(TF9hos zT8+iknnf~6^C%MSU~$Oii=P#Eir|oSO&?8M{o6Psyp(^yn`%}LrEv_%Usu(&o8;~; zuZ65X_wFoz=u|mD66TT4_?F^m{vlZ<#3$_+-d&XODGbQ$KnDY<2e;O>F0}ne!&k*7 z+E=DgvJG-N;EsE%XYk&c3wdk{J&Xi}+QGBtF*zsKIj+uqZ^T-r;b-BCjjVqC=+nM` zMgIUA^?NUcT1CDy3ZHT_Th_h?(%{D1Wc;etIHB0^13!y3GliB&Q1*g7pRZb)-{JnT zs7b!yODiAZk&7SJzL0~%mT==`x(~vy$ELv^K2Sb_tt5tc9f!hc+a7Nf)#?8L3hiRX z^#|~;R%Ew`bAk1$vg($Q{{Rp_N@LXK?7RtKYz84;#8&JYC7rVHiZSg~r`N|mTK+Yk zHmz|l0k-`q1&)h0w&SAlRi)LTj1YrAO6QEO+_Mu*B>K}GjXCvsU+(c#WV(_=^Es#* zL|&$|FE#x>&BSs}zH5Eh*XDEo00A7*8Klf=Cm?k-hjXo8k(lg$%!j16@(4%Zys`YM zp@RPYKQe2pnPK&W3rIij<9x^P#Vdu4d(BqL#dC9U9Dc>QKZR#Ytxoaowz73j(_Oca zpZNF8e=4V)FG&U7<_KHA*wZe5?X;Z!K-Ob;w%FVh(lqwCxc>m2n@rFD0DU%b_yIyp zX1I?2`ZFus%<*^oGX0(}`}+K!!j=nLJ0~uZLX!73k?)ZGW|R5~<1e+HYW987+^UfK zSR|X=dlt#Z(z;tM!p<)-i#9nYtrqSb4Ffz#<>I$bGyZzaeCaETe$Dnyo%_G7N$9=dV20$H2^cBT;gICqgp$t0B&853q zfh0vyWp(x{xg`4%JQ*uRsQCaw+uH-n!UNal>V2y_#L6x-`@5DqRpwR% z;eKP2{Ea(N_?3M2@>y82%k%x}yvDmNm*qWS;Bo8l}p-r{^fm_eWlR1uI)o zkI~H5_V*HbU^LS2Qpbf*4iD>8wJl_^WVU_p(979v)wTA#l*4RF+YYC^p8o(! zsT`!!NDDuB=O=Y{Uk`3K{{U!Pu+BD3)9R=CR|bTDE3EJ%n03jNa%8q{Jxa2l@uBPt z^ipuyCnR*oT2ek{&T@TwQ#bIta$C6v{P&|Pw3p6DBNZYPoy)+<$6rcz{4pe9zV3Pu zf6wVnXv>aYfzxstO#HyNP(QoR=ij9OJy_(DbI2eQgO5sIGYgHJXR)Z+z+&5Wk<|9< z)|cnO-^Kl z!+@-M6HxA8OXLjn8SVXP%D{5R2d2|N#|@kU51^+;s;F6*7GA`4rfwUE9DCDFw7v2G|*bx#MmRB+@%!!70u;&m>R-fhUoVmy$hc zfr-Z?W`Y+OEzUa%MN&f&Fh{)rDFcQL*Ym{(DpM+OS0e+80=ez)%>?Cx9>bp4pa8J@ zyq+_-iUT+e(*$$v>qp*DypA)?MixcD1xN#o^`K;pg+O);_93(R(M? 
ztykeshxHbD?4$cE&Ci)34(u>b`eTKBO{PT2qLQL zpA2>FE#!M?h12O@XV7l#pN-xon=NIxK=#X z81*aRbkl#rF|Xm2sf+BTm!HCvY4SI%?o~`gY9)OvXDl;Y+WzefzH{a8AyR}GW@K&{QsB)iZltKsMS!*j&x7GC&cu6FDA}iS9cVb&o3@v7U0YgAle!&^6Jz+7JJ)5KP`>|@a^tzvG=rRl>zm{a`eb1sCsosnh@O4{%b?9C@$)W>JH)BK3f z{0>vo@Tr-#fsy@{qv_+*7UQK;)%iV%yV+49_HR?9r1<~RF%M7vJnV>C^PBdwn9#(L&qXMGzgMK)=eG zjJmM2y0gu!pIiy$X}WFPNz?Zh@!6sLg4LApuZb^Te8$GupW>C;cm9L*uSyUAdQ<$7 zLZ-ThA~+a+AGDm`Ygdwbq#Iyg{B&patqWfPczVtJ#nYmeQU3rfC(Nz+wrj5t)4Y#r zr4+UrNf@c6Tdp?3fDV8Pw3cc*D6JnUM~ zUbvGxFru=KdXrH*qV+Ufb}Y+#JiN4wY7NZCt!7mf}A!^o$+Oeu_D$Wz{d7 zZi`Qn7|%(sTw5pC4f7wtX$6g((`C7a1>Ekcqp2qXx%kcgph!#Er}a%P)UY2}WZ-|_ zG{>;Dfs5Szf;kuXuO(~ghx`P5gYpz0G>?ew=5m%1&3xmvtYyIZu=z*gYLxn>pzX1w zva-+cyq>eZWn;*Ayr^5(g@zuVF|CGz7RNaOzi;;R(+?)rCYT@J{hJZ!Uv zY5aEOzcWoH#&r^D^T?a;n(gCm_p=K~&(byd&*o~Y>N-}D6EE9VA>KO(g}R@tjq{(% zxU0#vMs{siRJGH$6rF`z({CHaM>mLq4j7D58fk_!N=i2hl1j%w=@bFM!5A%)5~C3% z1*93>BHd$jH)Fqd?;o(bc731cj&nXIWW!l{C~SqIsZGxG^`g@esHku zK-e57C}3`jkw*w?vc6xEs!xEjA&`98f3`J-euA$aczE_vmTz7aST`fg1B5{DEOIoZ zB{X+t6`$sT-ZeRYn7`(9a^FZ8S;R}IQ|#D73xkKRGXEY)l}l#d&MLQ!{7!MF$o&T2 z?#I-Wm&SO-cZxUHs0EeFIs;jp4?T<%C*J&-BEvP6Vvb!tFYaWf7H=JcX@o{ zGGQISC#|#?dAh${V`&sR2tG5Uj`1K=P(i;5Y;-`VIJ8o0<3(7)`i@F z!rI|QAJjx)oD+Y)O3m|b8I=CwiNGa20v`cfYh&9R^opRFpf3+|Kvn8pWlz+*ye6LwxnI1 z=eR6+_fZ3nO&DJoCc6>;ldBnP(gTwoiVkJS*)NvDYJ`i2ido$2$4fBpsOZYyTVu?n z&Avr;P)VohT6{ay2)QeJJsa8G?|w3$5FYlZsbaAbHmSl^X`1ix_oXXkpw@AYs5Z^# zEH=@&Ei9L4WcPG{jD<96^mpdQ{=tMU(A{Nb+%xH)*{`ey`Vm+teh{=f^+W!%sL!BH z7HtZ#=>jeJ9lye(Endrq%eA2MzF{RgOWp&YF&|W`OsLz5=wy+I46s*(&ZD-;UR+Xxi|%n(a8t+G?s6O;jD(!m&mM^TQyXzL zr!T^lo4#BpQ8SwHOk_Umf^9jT-F$f@iOM|1%6zaHg2NV#>Z!uMGy}?)!ka( zLNRTrHR;mj5nBHODBy%lp;brx>85QUVsb1n#F^^X{aj(XFfBW7+Cp&inM7sMOM7DD z;Op1w>^G;s!w&|$7V(6O5%dnN?5Q3*h|WWN;yYOup~3~-JV8 z{+=46Pm>&*^=}vih(0M-)8qT+0;a}_mx#Y)Y<@Kmu>)UZaeR$L^_w9igS`=GGAx!r15nhtkAJejSx*;)QDo749pQfDqxs8@`KrR zW)qs?CxqLg*2=GSKyl)6zv)G@C@nuHyzpmj_=I&Bj5<2g3fr!mz)P=Qr7e*j*R}<` z4^lpOqoSA^WD3~hgX+z8%g}F(2TS~Wi<)cEs7cs$cPTh`o6c_QtdwP6a(R|Ien8z! 
zwo$h=!$2AQIk;qFuZ3WB{vpl7WRKCiYxRsUsmiU2?+sQ6wVLYh!s9n4J1PUu68UC) z1&9f=8zibf5Yk$?H18xYf zqW_R1_N8oD#8gd>z2;i0_~?mHk!$~`MkSYX7LF=Y$!#U4I5El~jk88rlE!mz>HSRY z44l)-5N(V9a8it7s$*W`RP29%*lfH5qs-|d)Y5)e0{^&;sRG`cJ&UR=e2Y${53vfb zK&(A(dHlZlb@_?x{0j@RzgGs=f9dUAU3oGtZmcQ_3Kp7Oq``;*K5VJ5r+X1F`>>8s z;c%|_WX-w|`XAufm^%3KCbA80?1}mUzJ3Q>#M2Uj)lYEl+Q~ZJ(n~^k{V0oJ(?djV zq2?*LSm*g&xevoTS$ozjf3(m&)oa8lxceVd$cG>wT&nz;l<9;h`5V3LaL8zBak57r z?br@;Xfl&QXxiYS#NdZ;F#cu%5B#XYIv2f?33^GskzV_2(+|6hSCyb#6VrCpKW%&) z#y_uz1CBf1%S9?5W~cUl)YbhQ1@t_;ro&R4gbA7+cj=VZ>eXs&ZybYTTU0q3>Khu< z$sOhbC?l_GI!i9M%^Tjll6_KHA%oCmo|_iuFxt|bD>#)zuCl(t=mcF(#ZO%CHWg(- zNT3~~UbMObX@7+aTK@9nFbX`7?#K5K4XkA)26!Ep1K+>^x}eng0&B4?zyUs)z!^Qo zL7L63r+i#~*PRK_=a@A3lK*J`qpb=*TJ@q^x4M62H?p?eSx&6|c#1vqb&Rm?T4|&G zWM+&xV)e>zmD`MCwlH^Vsjc#ZnsKrg@9^mdK?|%zo2uE=p^(JYAW6IILe|+{6IRF0 zAF26k?7Yj?St3~w%8^>g#rb@Y_8Bxmz`9ec-=}o>>Wf+C>+XsDIU7gs>(pNj)nm91 zeU=M@K2XV_gICU0g$TH>T;}g!-1w>NjPCT&*K;+dkcIDu+PT#_n-J%>nB+_AZVwl za>6f02Q9P?GGCVS35av)AC2S}?_8uk8YrsTFwr!)j2^UYI%mR2Zkf%E#7e^+sJK6o zE#>a1P5xa@euQ+PJ`I1i&g%M*!l9|oYmC^!}felMZ>>e(Fg#|9&{8GbTHZFx;D`hoZc@RD8L^i6@&41 zXxzcf|0BkRmQ$tm?Y5oXSXCSaz)HmYWPh>czKUBKan?Iz7M>VOfPkb^=f-tXO)(BIv=*WZKx-AX&8Eq+g=qtfp_LoixNteS9)PzcDz`uMQ4T_5|yeKA7Rlh7H9BAT_vQoVQ`n9plX8C5pOk_62CQFJ*E-9=k!@1x;x5W zgza{;fBV~ax0T-cBr33SkqJn-P?JYXuw$4X(=hOin$(*>q4iG+ z;HnStl71A(w5uC?(&Zig^>%SsjrKM5V6J|vb9EzO6If|Bk3!m31&4(6aM(d>F>)v3 z6PfWH7!NfW&>de-hh;>^o;m|ylUuVTJhxi~m*UYO!$j>f7WT|TQ)jXHPS!BRmhE}z zbwXGXK-}jmf%fqXG!Dw~+R<7!Ag|0#Iqj5UgDrAdqWOBmk*Jjj2(V9VRezP?VLZaH zvf*%=3pMy9ZjYyP$SYD3-4EzzB0J7gBBx3+^9{9`Mfx<-Co$~j_w(yo{9_^lzdyt0 zJEMs&ZkI7F%WB&?+>Qw@S>6FI zFSN%Zr_=|)Wo&%L!s6(a{{W%0^|`%L>nBCFXQ_jhE!+cyQ<3ZzYt~eNZompkeHtM;afd+MkF*nMYCy3SjOMmt9G5~K1oW&Uo zKcb(*hsnM;yi@{5f#;=##>>A&E6@C$`buVsgqr`ujpgY61a~G59ZqEqKSDf(>Myva z`S!hX{k(2%YQswW}sppgAuZ zs?y5;10cv1i#TR>OzO`i`i1EqRYD&zSK{^?LOI6vD8Sy!LoJCxwklO&%n|#lo#iii zVkJSR$>xSw_l!af#>@4cf+9q#0e$(p;^6#DA$F_=54XfS>RxTM_N5r)! 
zxpQeGwtQK&lG1H1a+Z{hxXOZ0mn~f#bgl+h?+b;QtlydZnTP1Dy>OHPk%jW2=Xo{s*iW6M}#e#k8WGGj>H#s$Tr^ZC%^sqrfsNe zs_cmGsbEX_DTqIo_`4 zEh^fK%|}cq#_okVse*v-uPPAT|3vogYs{DojmT8>S|~+bqZs?X1ihYLB%QuFvzYI9 zj?RMZE4Y0N&05dg?sAqz$bYt**xMJb`y%v4)3&89{u8HIvvx$=7kM?&c1-5{RnWJf zyH}>4nAm@w685!xqz$7q{{Y^jd8GF6p82p}2vn@!|LD-yKzMO5CxEQ9{AdyNC@vWK zUe|#A!(t8=zWsWjAj$dsvAUfi#Z|7szZN6LR?O^>_=}BRguMr8;Q7U7qs6-U0pSFRII=^w{>mc?tQR~k0G6Mq}%ag$JNd1Xx#b4Wszo%*nxGTAMM-4g1w z9qSr-Tsb-?XW)bPiWPFrQzhRdTQ-KcS~dy1s28_AYC&}E`27cv{(pYpFU#b&DG`@& zqT?!Lw+F`&Jld^;>ExB~n{_T2?p>3A?m(stWi+E4fbU$t5AjPh*ygOBJ< zAL{H;fAwBuiseCW8}T2T8oZ`ajbk_Dl9kR_Ge2$kfoFDgt{)JJZ|fvzbZux3++vP< zG~lVD+UGn;u85s;!H+g&NWV~0nKJKrbnJZl1xoV!Zi7z&hNv1eC0sB_R^7DY%r55e z=>yZv1@iU$ORv3>Rh9Vp*kTv6uy+Dz-ejf)jgh&eF4Jhw#NvP3iT?TV0%^ii+NFfZ zq|YMruMee5%e=NPIk#8>jPPT*E)E^%es|Hk)d*bPyU&7f1V-z)eL&#=J7twQdmkTa zZEo4iam!u*&!s6{EM)njW-M~%tSa z^!PJ6Ji}~t_jLJtndh~J+p8Knp>oVA9^?jf>30XuC`(xjB+4p2;a8On;AjCKohZ(* zREo#-(L3F=%OlxCvwwTn*-vqS#@|kv5A^ESR6JK$|}#P{yTv zGFKE>{b3egD9vyl{QkzLzwBzAojPbT@d5k|N{-G2kO43Q8qZH1>^Li{kL(Y$oH5-P zvt)^tvig%2E~bqNEGK*nXQS=AL5w(=>LA-7=fxP_P@twVOTP*6%`{5u@*zyN_5(Cvh@ z(IDaS>aG%PJMsQ%>vLukyKX{7Y9n1=M_4i}VmjRm0+cx%)hhyM!{HfuJk`^g0YsbN z=aWapgk;uAGhB57HX-fe^Aq79hX9*KoQv2NCZy?7yh{ibVtucMGljY^1hSm?^&oUl)$4ca;s_>k4Nrpz= z4Uo?aSc{hkbuE!}l2hwQnplIhO-NYHWHh7VryY^Ds}x*aKe_y2eT`v1H9C=lw@*76 z{(NY8I%N{>&1X(AHw?sWiGj`=+}Ck;^>k>1H78oe75seoeGR^MJ4`Dym8a8tRi89i zODFCf_Mvz6W`YV)IW+F`xMDzFiRRAb?oUbMCJX%b%!S)Y>3ZKicX@Cw?0ACzylp00 z+Pw5ekN-cw-c);&YXxh&x~unpfFtTK@z=gL3E#aQjzA&~p#co`_!)LZiEbtxHe$d3 z+_^~cR`N~s$uzrMHu$BX>^E-=zAv}eR*RVOK!gQVI+-qYIy-t|KZxE;t~WiR`Qdt# zI*77f(X|FYY3aXvdXYU&+Ot->MZ< zAmQ%+u~%qal^>k(i+PoWQwmly=R_`6eDsx&vFG?7gsD8}WVZCgDm~*2-TJ zm*iE_gS9^M$eOCP`PUP*SGAYTA6Zon$cc5wzW*lGW&E2#Ed_y`UsS!;Z;Y+FAI3Z1 z!H_V1ofI_})!;r_GyEjHD+%{f!?H39+7D{7QMMaC%xo&@OaC_)(#api9gR|AatZNy zJ@J7odTnlT;)4N3LI&~JcKK)I?y!%TtEumV@JzmF_0U7J<&PohAUf>{&4@lZHHvr7 z|1!mFxCV7j2>toWB~G$%y)kXsQGLK<)A<4=eCvy6r#XIn+~F3nJ5hB*&?eR3mL+f< zGfwJ~mHBt7McLzCVAHVupCU&$@&yXKyrER5S_a${^`|&353f#J5OfzBo)j@?w)5h1 z|DJ$(J!#u)Id5T(dfX>3_5CXlTxuhbp71)`5V!Pqnb6r(j`mrWnTe--*797Ejvt5+ zhju^Fiqi+jv;$w<6-H9IqvW0p{M=wgExHi)#nmw752VM6}p0BQ&@{l*EGKiulNtpxd zphxfIH{5&kO$@FKEE^UkaihN?zkNIRS#&oF!^zE9V{~xiHFaEjY4o0~+*#Ln-y9m8 zYF7(GVZ2dGscND|{7JS0@v@bsCs%7hu^_tNMU7zPiTpQ6nZ=H>JFbn~A}!FbkFPhm zJlXytRBNHB{N3wqfj?sadx8p<)F#AszNvlZ7)gj731FBcql6{)@lOJ`5a_V%Nc)JM zJw^^yZD?0?{5HG6oN3BX7` zexDl{=V^iSSqkEyMTqVE`EZ+k4=L7tr zz|!*RFAqG<=jkuYRPyu(4`xlD-wEglnSxR^+F3M7pr+$m!86>voxI733fWNnO8cF% zE!LSD$_rV*3r|BqnUf!f@BL)07?2ikXk%3z)yf^T&XLlG6@Bx%jtkTJs>b}p^V;e5 ztItX5d3z7Lo6U<8rMsDS!74AJn~&nyrgFXq5-u4kEC8NhJb7{yqwW9Kgv{lBJ(IspJsx3kAC?-Ikg0GFVf^=W&c<@fK2)}*m(y3ai7kWs!n-7f^grt42WMl zQGw1ncH@LOCk6^nmbxTP%10*09FyS#`Ryki?dabaaFkrJiMQ4;?}g?$qbv!4r}PdUAd*hWsUBZ10rZ`1rk>b`+USDa)H> zxbG$nN;lXj&TuJ5z8KeR6pYethl6S=RPGE(CKul&A5gedcEv4n#*pz?9dwGH=D)%LfmL*K>7s^65sm>*dcBcC~ zzG|L<#-ZQXJX<*uO<7#=L(@S>QP>qNfAda5{H4&}uqBy1+Nk383aii!8c3Cz1zrcI zqIp<2a-02UK-8kaR8O#@j<)fOIu&i|~8U5}1TwbPLof5+*^IMFfe z`N2@a+%$Q=O?V3ux}z(i_(otv+xIAU*-|)!l>cFCjb39s$4!`#v23;~3RWDJHotEO zv)`D<8}ZkhJh6d4H>S=7VyU?GyK>ihH~)X!K<`}Hcf=d-hWPXR6eZ2l0+!L4haO>3 zdRGRv%#-5c9&J-pjx{b7o^Ez-JrpT7Kk!nj5vuz~B?sz0J=)(x^9?2OShg*~?#%dk zIO0CT|MmL@8_PotG#v8jnPwU%awun(F~f4SdMblWGCVhlmyo`>P^xQZVaogZ!0UwA zS~t~y&PqFM`WJ1x7hl_~OUs~$wQWDri{*RKZv?Xb zy#Q^w8@|6jUXLmv_)&+W=fYFbPw^BEz7H}$?3akGu+GeHuPOT`C=qvqPR2Jwy++sg z41uI|L$%2M%mDB#3m}NZlCnDhow2$IKPfrB9-9yPDBa=R zA26p7DluI*muyUFu}VFWx*s?>^;V3jLCPdno-7cM0a}*x13qT%wXT96uOUGHcbLH_Y$naq@jTy1;!t+ExS*eA%?7O1)8I)i~eiJAYov0r&P$ 
z@x`|9ZzKeajv1MeSA}A$7s2?nxv7H@}v)dmeH06=;+DA+!KuTaW4zzyWG*7b|xCwHnqsP!s{6%N`jAlbHdrMS^1LKv+VdVKTae<hy6aAlrORfD*HDl9LJTzWA*}9u40Gkfm3hiE& zDl9FXnX5HhF6CakGKc+6C87Y~E;dasMo1i4HhD^mztu~azl%WtWaM%~o&7qq8BA@n z;epPGQga&d_mO!8`?9U&0>em@DAML}OLNw}#A3~znTlCvHN`(oSR%)3li9pSuXWd? zF3w9rdk5R=6LU>DoK*PjIL5@mMm-we)ou|am%FA(6B~C^2^`l`psYFiif6Z9v0iDcHZ$tM{VsFQaAtQX2UT4vMsxE)xEB6k@XD5CFwm; zeC)Deu{b)k=dESixBbmm!ZNP)ELT<=9Lzloq`)5?x9W)_Hg@ zy9cPau1EPQML2*}-ubh~gvLc)P~hZke9JAfuYyO$Fwp}KVBpvz$PQH3It3QoeZjCn zhL6VqNUE#&49lzO(n4=0Yfh+fjO=i^%RG!u+(r_Om9|HZyR=b~2_UL2#{XD-LqmS!kWSf1LictWbG`})?$YJi}T%rCS zul8%b0fpA>PQ{dmhC$HW&O-88JkdGgBno5*921jG+1~Ea$}rT__dS=JTr)l;h1PD= zkTEH47)Ma?zD0aZFPVeFtE(BvZKx5y+-GYKcsxBZh7wLplfFzr_MWfI|61r;mT+Y? z(B$+Y;Vj8ON4E_K@QwU70m#Q7;KcIU=N?@ndl-KW zdOu0c_ie}E_bOkUV!S{?SF}kz8Fz3~vg0wxi(g4SrL;CXDpDZ0V7`bmS4_OA{%tR( zLE{Q75_?`+FV?90+(C>FP=adH%`Un3R@;Sk$&3sHJqQsuisB!z(m;&(B2GbpK}NKz z0>nP(C@#e%H=pwxV;DKym2aSJBnB+ti&|Cc6zgQoFx~*P@1w?Uz^8FP2JQi3glfg# zhF)5R0*h6?u#*Wj1~-~qf14|c_Z={p`kcP(Dc`iML>b?tw$c}rzV25{SM>7U$*czW zY5taul%!YHI$HfYG_v1Wmi1n7UUDS@ z^|M_a2P01=F2HfZ{{LZ5T4reN;IadsW-u=p_ZvWP9YCDw=GLng_%4-GslxBHefC$w zbadxo*L^w}+~xIo5s*^+luwS)`X2*e<2t1TljG&0reEmB(T2DzmYRMZn;6aFyNB65 zE7WUg#}-0!rlL{Fu;86M4C}VZ51-^87g#=(7)K*S-Ami1!p~CscS<{fllOS_^VPf{ zD8;9B0!J{dQ+R8flRK*TVKSOMcnIuyln@*kphf1Km6hzZ z%e2&}J_W6I1 z(Kju(<7wOv`nI|+i=PiXnVQ|A^D%Pw6iJg+Q^pNcO}ZVK5<_(rsU}93mC&6slbNK6H^1*zo0lq3K};)2$faL=&gOnnX-Zv@nTb-a z6c*{J+#&q$TT%lPmtP5z);oVPQEjtcGvrtHqreG%Y5D2sh;z^m~$C&#NKq`lku;c@?W!mgM3m<^{qF1|3YES-w9*+B8;jCxNqP z;Z21-gehKu`haH!4RS`|n~_sFYvIq8E{jlsUY2JJ`C$+CRmaH75jcjGyd%hHpb6Q3 zfGNIPUFr9EqzTG~dJL6=zHCz~5Et(%rSn4#Qg?iwAe8Bhw1{d%M2{=Z8uLpDaf;|7c|ymsQ$77%e0eg?WE!q>=rwB~O~Mbmg7@^t^O&Ycp5 zKq-`cXY%B(hCLNc%z#cs+tY#vhlUwpzCV5z2lfI;zEvCTZt;y5ZyH*~eWV)}bdfnB#5(aewt$HnZnacf3cTjFV11?cwclfA14c^&kW`AdPevz*HO+y7D`=(d ztDH~Oj>Te5=6$fuil3%-W}A|9dL~^CxM|vNz^~kxj0A!Q(0WHWF7%FKu@eFJb)LU3 z8c1r|%i^7hpf^ZqVzXiYVOhe~Hj%~oguIWwVIUyB!M%Uyufljol!8|baAPGXG$s5C zspu&Hac~_Vs6IA@sM@rkR>C$~eQC(@hHB)6SEKi-CW*j_`)8#l+7Pv1)+X&R_*pSw zWSx>*U#q-m9KsTdXT#Dbun@zmEBWG|r=Mi3{zBL6qTa~9L)3761rwOF4rWCGSMcb> z8at^4!V>VZkLgCh+0{;%UkRw6`uGW>}cnP=8SQk?!ZSFRd= zXl=VjQ7^FzGUu8&Xfxb=^=J1{o9Jz=gAG!lrNM>axDH6aqpo+9M{L`&O1gJW_WxUi z*xti0t6oIbI8NiXbmYFU;tbflcOiic= zJXs0;0U34iGT@*V@e@9HCUPM`O#n?`$kpN-W^OdtBt8&(W}w8z_*5j3zeuZE=WQ2s zqLwk*AkOk#I4_{!sE8=j(npCJX_wA$kIV(2ZIIwR11taw0i-LR%Ja<(3yUjJ6SeY^ zW+%uyLx%z4Alj+WTC8sol@bj~fH; znYO31%n;kUvkfxX5`t#`(OsoC>d3IbMBctp73UqG&t$_yBK;n>&*Z}7$HZ^HMs@jA z*7MMf$h~(1ky*ZZD?%?Tg2%$m1wqe=$IBzJa}NB)`k2afT&W|_f803^Jc+jUebaQ=i;p_op*E7qMy7wWke@MHv=(oZYt!2`bP)HZ;%#T~Y6n|ilzm{c%uoGcwmC!V zd%K~}*%TkQ?`O3KJx5gshqz1-Yp!!YN=+|M1n?lq>bTx%(q$O@T=_h%pnW;~%7pYj z<-TpNd1Q`a7hdg=_+$+%4sQ+*&3m;KLdJN6l@_t~Tc3V~mJ0@qfGpWSZ{-X% ze-?D2NJ;wZYv>o>JsUz4_5zt2z~x}fVJiHRm%|@*jZY7&a4~+Qc5U2na8o*po-2&C z9u%$^WJfPZkxru6gA7v%QkNt}j~Em}?c_u2_!HM@p9qWovReV;H$BsN#3S5n=7z8) za!sZ^nSCugm1?n+Met3XgX zg@M=yXbPuFhOsrS6;T?!vm=n1qw+@GtVgQ#EEzr)I72 z&IV{orN_%T?H7vF0lAp0NiG2{m7d-u;@HF8i9)NB7ez0xOZz%k?B(%@eqc3_5O^P$ zjD6Fp$PZ9ck57j$CK2A^i-Lfe($j`TR==?G?HKwWB6m}YJ-za{)VF2>86Bq%9|s(W zil0@ZQ4`@Ds|o(p1(C_4)6ouGOxz!MJhhanxEAY`cKdBVLg(dGg-_l&Q^T`tHfZ&# zwEz!j6|4^(fvDZ3T&$AEz`Dyia#drz)`dz~-Alaz8K@?_^L0=x9_fjEWuwbz$*Y#k z6Onx9oPSX*nivF{q<}*Ofh$LpXBYVin=+_3F_i#F!(gIM?U{UQ>E5z9_PT$umR6yu z81@H138ee2<_EUK#4(L1`lBelNc}A@7{dKOI9Rnu8|vx>k9DMa?M5kzO?Xg5CLoJN zpiAMUn<6xbQI9EdhTt+icD9s--xzAr&BKw9cz#A5OrlwCIvSYwJJ0HXNGjAjOv?sP z_A{@a>vxXmKZX+XBoWc`eT~Y9h9O>QMY8557l|ushR1FoyaTZ+y;er7$ zM@4}I`M|)xpWwbGBh+1?1YiiO=rY?me-$vGTb2=Rpgy3ktlD19@Y=w^bWr@EBkK_q 
zr>cc`(%y%tSMX-kR6G*AF^MEj0t?9@ZFJAbXSOf@)JFQCl}*!auRqhIo-y6lRuS1* z_M_2ynY7gLS=G=UDM-Poh#617^>g>#ai2R9i@Aws19^bZcTA`smy2hmwx9nH_@r23 z8}Yd|8sy(i)n|Zd#@zSC^r|Nrekd)t;SV#K#y{S#S@llK$uoi#eC6H_giI93kMJ3o zbi8=+IkG{v0}$wNz820NVUWZQv>%u$@;~hVzmFhaT&kVG640S0b&G@S>dm+IlD3zT zSsvU>{q%6geuWQaHxi%i@e+Lgc2pR=vrt0Dbk2kP!#897vmV4mVF{e1GzJ^t->8%x zJM@BqkK5}W0MhGX0E*>}m+Rq6&^p^H;ukgo*b;VFHqf=Fqx`6HSw69zB z=qZ?CN#xwrUUZmIwz~U1l+SumX^X8Fbj7k0-IH4OvHRb<9VtVP74R(~VC&-o9XD9q z_T7FXD2zWb$}Ul6F=W_Hr!yNf#v4@b0K14A5aygCz|?#$zKnMbH1G_$W@ zchct9R^2L2ogq8Ia2*Fmcib3{D8)%Erf&DOFJ66Ei+Q|}p!R{oET09cM6K_T#Xcy| zFqD6BAn!ab4d!@ZBPDH#&t^G%p)2y**V=|OQK z!@6-iNAZ4;p=z$Q7pSE3Cw2y_*jMsf%Y5CHVBCwWPOJLsCw2gtZ~N}ywh|#+%!Is| zD!Vz+T{c8iCq8aNP=61faehn&>_hf46@I|Cf00Q9H;zS|h}8G!J31|Z69*T+g|b03 z%}B_fl>7(9=!MWX0r>c@iQFH($O6HyWCHDEhYxg2WVvMs?(KV%=>-Bz25Y)#x>vme zVfsZtJUw&BCvwd2{)>DlEYG%iays4T5tvt4UE7FYMFefeBmq5th4&qpgM=^2Rws0(=f%VcgDu$k&#d4iRffO%71xev z$kHrk9=}QRSFBT0eHrIUIcCO42b8#L71ggMtw0K(ww0&_hJo#Cg;a=%l&4H`p$y@0 z=OTma$(pO6$&V|U0K5GD9tdEDVFd{?`sfobzeR=EzbKj?)k>sJ&Ix1o?ewR*`GBu{ zg?jlq=xiX69xX1x8I(k`hQ2gGP9r|$`=r&MEd{Mq_;4HlnkT*BCy@14IrZ#Dxexde z9&K+g6jgetYr;bYn;Q5w?;*O(3f0voqG_ZIDIt^(uaZh+xRiI^9ZYJ1?1*B^{J%3n zdg*|d$TEcmK(n4osJ;{7&mP1m_(<4!+wA~fLqSc6{VmxZfleo~f69EG?cXB6NK}bE zj)8!?;NS34@W6JJd}@;vP!wwzbDG4cz$rl#i>_bb#L!Q*UNJfz80nQalYZDi^}<2ftUoXYkRV=&EhhB`jsU?R|f{Ko8(~jdo!PU(VNy^Rv=d z66(Dd`npQ9h#c%x>;s9os&MxOu|{^dUvrxXtZv~>pNbUQBfl``;T19qC#e0ud@It|UqxU2ei|BA^KletuqS#mPCauc?eF078 z@2et|A0RR$O>&0ciAArKnnnkn+x$2ib*WwQ?qqRuO`;&nv&QJfJv~?Fu_0@)#`m_u zMepMu8fJ)g@zR>*D1z5Elii+=EFnI5llR{u6)|yj=y%(&4vXV{8K(6-{IwoM#b4KJ z^&m%S4zitdMsWYhD@*DAW!YGib^1TXXP=gD3o!@f*Dqn0!8wYQ9@Fk62Z(a%{Kp|3 zH%4gM{s&SFJttmYm+0OoRs#5i?fDHY*mMRlr0CJS{kZo}UL-x{a(WY7Gg&NxVz>mx zzhuhTpD{04)$_E>Ot!DE16n)HQH9mq@e&p~nOKSeXI~3NM0Bh(3hF*S4en=ZcpJ+V zakXJrAto?nD%kx4W+RQ?Ui9U?{Q8nFrRG0?ITM~J_Ua4lBX^?0Q3f3G5z;HWnG?(O z)^VU0g%0V0>!ITGp9i?jN9K*}CCFT^gS8UvWRqFq{}k4Hhb^%+<&poL*5=`by;NAz zg-+eP%Zf@hSW>YvCsf%$f(KFrBBV~dkZ2X@Fx{Nqtg6Rq3kMGG1UTDR;M4fYts@Wk zx7&|+$X3{>RuXql?}W%V4Pt6WR9jdj?LbY%?#aWILPrkhd2uuLaC1eRSH>Q-_KXtX zU89<$mnf?LGwKcWZtr3(z3*gQmmA7u^H9znKkaqL+(kiplxE&1K$KYmpgAl@r47J1QOE{~ z+Ey3SW;hV_sk7igBB?jstx6(t(an}y4j(@mWDK(--_*Di-2&btYi056%`0`I!4>Q( z(EwI!{>I2Q=LRAOJ z4`tHEO2ZP!gZKw&XfT(u~-9U)!`tHl8 zKpvSnT0L7F!Y)4RbPOc1N@;!5V6pADIzCz%9h~K$9%Vxsu;*kU*Fcv> zJ=;%KClEE?G*i=Ijn_}K@&c*_37%g6j^-aqJz3VfE3ve?{$MsL+oZOfq z^4uox)o=`8#qoy(KJ3wv)(1R1$`Vs2!fmr={aZU`lUT~{HbP^#jcUZt<94cZ)*zfZ z>Zb*yz4(Xaipa>4ZxtYqLJFIAfNJGoQClJdB@c7%&-t>?&wFhv@yIfeE{G?b0g7G^ zsm2*Gm))&NttSNFWdcrlJXz(BZY`w(Z}9*oD(n`D)mdGlu9W+&i#dTQ23QO;n!44O zFrS6)p4^XGc z8-<@KV;iwMUJ*al7(CXZtOoqKi7w*-Dn`z|D~gn+s;W$i_+T)Ib{3;ibIlIFi5ldZ zv}Yw!i7`o7!7yOA(P-kaJVmc!^RN~ocxGg2X|s_2QzFT=H$>r#I#xf?>No9IiNmKp z7-|5<<-GJhD7UpS$4Bqv;vmrrAA6Rgw68=j=$6FRL^pG)WsqXeMK?to_AF5?Ca;rj zR8r21t3=p~pzi?xZMPy!^!Fj#QxnxZ&V9DwBI2f|GcsHdRhfP+?pp)&9xOPoUJ+*C z#l_=G9m9S4;~e!L;7uVwdgvvsjK?F-21fCoAN&nN6$I=c`x(Ok6=hfy5!@2L!2d21 z`j!lFIa?1O$*MU4_LfOus3uR(N|R_+4pnAG^gQJGIOBCeD~M6KbymE@wH++@^X{PH zS@jCvRv9B7=iTq>V4v*c$-A8+3*kJ0OGJ&uU?lwZ)r(od_yM#4FqGF5d3 zw8Filjuay6o-xi9jC}SmBt4T2%eHiy{qA5gGTs!c$aj&a-qaPr^r3x0T}#RI?TNix zjsDUX7%l$b^si4u{FXZa+=G;+nrn*!lk_zP2?ybBVXsncmmg)k_7QM@_qK~*e_Ls< z55)QBa!$sktbje}cCybQ7OlD=(n~%^%D1Yo(Z0|3%gO%KU1919-R?f)tOthl3E)R# z7jm*DpNUsih!z5;BD6HVsgDk<-?la?j%1@XYUKk{~qgA21n za&!X;;oP3g8pH^}7+)`>B!BK|uwyhv4s@Z2DH1`oT8fDMsz=)_#+(#UG8OrnTtnn9 z?tG@h9|A9iIbhxN;ruH5V0dXMkLAellimS%#u+$Dqo&4^f=ziG?8p+LQ1^g?D^NBqG zYDj=}reb1uNP0~FpfQYglkLof=rIj(_;*d?@d?^-%5Imri0FWv>#mx z*uz;tSulyFn?%^&AR-}yp@%nLYXe&-Do;Z62yHxH!|;fw*&R5_?pH+eO2UjXmyc!K 
zOQR@Ci<+01|6YM}Ken`8;_2gdco2k$*sWtbLW%fAGoOi~G;v>-pt>H40pn|RET{?e zZ2boc!byw@*%8UG&~_TAB;jwRS5R2(PZD;~E?aN&AiHqp0s9$JKbgnBcJapVxOOxh z3mz_NUZl;iI8fhXVn`q}s(_su(3cv&UzT+@j*}pV{n^5H`hoUl0^^*?f0nL}_pn%Ffju{ESFctsLwN!tJR0=r!_ zYg|v>&UG0aTAMgjRrKBNf1Le!Je2JlKa7uk$u>30zRVzlWSA^jBMe5Eu_bqQ8MmE~ zETe2q6UI_P)-je+DOr*d#+G%cWGjSJl&$PNr|!@9_j|pbzn_{ZVm+=0b;Ajm6;1&e>98+Yo}ztB?MCW%ZtGKXxmp&kee#N#Ef}Wg7TN z!5j-^$}qh3pz(9TW)UQNtY1c!2p+3_x?5SnQwS$(YtDv)ITQ+9fqF;k|D!z%!^V!!GT-nB6wKpr6NrpA%1lX*mh7)yz6r{e^ zz&JQv!cq6yV}O^Q_3nn<56u>Bt5%{EJkrQDtDQd~{dEN8`?6Fwim zm2^68eyW)>03W3#)PX9`_4~hA%|DK)1^EqQI1&hqj46y zvdE{sPFI9i2__b9cbm>Ktx~vq z^t#Dc(QaoLy}uQ#*2lxv+Vf~u{0tIBq120?M%WvBoXo}0C+ zMf^SKDeVdxd%bo1!Plqq59wU;5O7$`#GCOsM(167Xew$O1(J(2zYp?D%Qg(st7SSW zW0w8m@e5Dli?zkvpte%&`-mfZKh4QM4SwLEb$0%*lut+Kt=~Lqt-!Pv#23XfF^GCt zWoN{0JHX8`)fxKDD(A}3r7OlKTb0!|e0g8RE&o(ejpKg&Aicu_lU*qMCROdnM>G@r zKO8wjPdZdc8i@nVT2I@31-TH!Nb9Tk8M2c9k+TssTRjCJk9QESVllS$yuE!?HDY7^$b&R!SH+wyAlVc^IVJ- zaFfhcm{?a^S1*^KN^6s0_MP49T`iTe%vxR_Qid?vc#b&9i1*E>{B4ur|u$7Z#zjIRp;BUb?=2$S>1<7_TvEi(x%W-n*lI<@G@djK?b6N15Vi^Ty zmCy}z>VwL|-?>AAOicA!Y;VpfVUM`s*P>r(aE#z8xW~n1Mt7G_P{5{lP7sfI$jxVO zJ_mf<)xYC=)pV}O+pe{~hcD>DmTmA{*=Pb;LgTp27XqWRKb*IB5|ug`Q^7QIDo)d+ z?}djCj|tcMk8??;w>-lX9+vWa-|~xCWR!$w_8P9_SFJ{bm_!YJ_H#@8G^^{wv`q15 zjV_M$Gvb-BqufYuR$N&WUV&LM(pTFqrt^t^>g}Wa){ifY2+(q@TACY6QuwDK#l`7Q?TjtLDjjS>9Obq74yF~BH zAJ$wyU5&yiOV_{AldM4?SRTz{J{`J_SXORd>K5AF842kSZk*7mci`BJTqrMp#>s|X zcqo{{*Xc{CrFsYM?sqk%X3rB|Z?G#K~u}u61 zdNRRWLbDjrtKhJpY@$*$3(Gj?bVkl%sYA74z7z#%>p%1?jYUl4Un-Zft6_=*&x@Z| zXVUqCciI}ZJRf1fMw)u3c=+0x4Wsxv$4a4BT;USQ%YF;!Cw>O6B2NdMTSj3yk~|s~ z^z}a;e-&#fC51sH8;ecXt?Us{O{m=_p>=Tuw-=>ykbm;!A&%?3mz-V)%2_XY^c|Erv0{dk5FLLX$Ye4(2$owXv$H8?lh_ z;-gQJ271)vU#wYat&X@7&uH0<5!upjV2q_AX0>QtUw1}TldWD*Q*|r9=6TD9s3pFR zjT12$>rw0Gh_I+L#Fy0qr->Yu7%V&wRqE6H0ji;pnR>v?(O z#;1HS2d6HF%)Ve~T?gxBvG-RFouI!6Ijq{l=i}-i&o!Iodt>_bVjA}CEaz-$Z0!|s z&B>+nRuc)sIYR80LzZS2Y;8}q>@HO&|R-ZbX&hqTV>o;y` zR(|i2De>utuI3sEAcO`+hQi>SV%>7*UpyO##Yk)@1t)n8^xV&Gt?Z_{!dy3WYv||6 zSD0M1blW!cZ%L=T+i~X<{SA`a-5CG&!z$^xwxPdlafbi+ujF4s%gsVJ56a* zFeX-nQg1?g{NpkX8U84Tdgxa06)z+8kaDREee*j#4x&Ak(VyX$16`_>z9qN!Tm6QG zpnez#^b`e*&qse$ci#F($5Xlj-EyPdoF7lh5v4G;v z%U-cQ1k(>Y z>8&*BYs8F5USl$VNb%2^T8kd`v+ggwX41lL)leS0>d_#y+dC^2UTnm5q}Rb!bG*K2 zX?=U}({sflrRwZ4ZL#eQDr=HD0(oy#b4&~I?#>(TV~-TfmQ4YBqzJX|2=J xA$ zqbJO2i>EFW2lN^6IS!rIb03e_`K`~m`@&3Q2Qu!EuDj@_J(;bsRaGd@vJP87oodol zGECiVPu5FDt`F!gu8Nf|nI1O#9hcy-`->`{p}zS$5kVLT zEYl_``eiMJpA~Ky@0U+gZn_|DGVv3&98;N~Em*I97M|24q}eG$G4HXvt8po4Rwot>p=KH2jOt&WzP?Nly^0JG zjky1d+QW&t73=nT9sPE4Zf|zDVsXiUYaVV|n|vi_H<5#7WQ6|W!+hF3+$SIpFyl5C*C73#MMl4*5+Ise@=)Cwj zcJ&nlZfJ?mhuF*AqV=I015QVRb@|2tcBishgl|lU z8)l0JiK2^;9rKsPO?rbZJue%ihHFeMw@I2~7rcK~-R<}}0XXFM8?xQxV1+~|MW2Ewh%BQI%Q4``t1_OoopkyZjg)h5NLjlk3e$}j9UYBz zRa-`K@(xP&Qx#16NLDkSVuGzzRf%a$o3{SPbRfDhUk7pR!DsO6oz+r`DZd{%TWCx^{GZx5?q zYgSvr2wU_V{}Br#X9yo+cGOTx_#_#<{dP#m?LF@@GEY62%T{%^hX5ze`wJGiB&|__{v%WLBeX0H##Hr1cQ=SOJQNy;Z z;-dV^o^b7s&KOUqSH+5(zG_xAG33bE^pZsSG1Y&wXSX-#NpK69OZ!HbP9=Z8>2s;Y zUF7VqnM6-~&HIR-9b7V(X_eM_22;D~uhocGZ6ANGI&S>|*R0tA7BjNTV85lOglGN^ zLVYur!99FvmZ=?4GmZIGt}?YG#nFqb(lGF8xrLY<_%aqM0nw!S+*^9p~UkyatEcLUJldfuM)tEfq7Mrx@Rh8J19RIqy zc@z2a`GNJa+o(pY{Lxd(R!UP@Oz1{$_YWr;{2{3YS@C*?<9dFd5(AFr+*GXO3u^q9 z`s$YXlkX|z){m}ta*XCeZp^UTOG!z`;xj_6SUjy(S-Os&*i*nCzVhG!>^j&?;t%9n zUgIzI1?#BbPKjEgyD_{wp8VvRs;?L2EmutMR?9lA;Lzcgqr&bBl32WxZWW70^mQy7 zTianiBd?f!THzs~C)2j%PwEku#!m?u9o}1T8`q-iW@RbmojDPoy-)^G=#VgmJZx=G zMC1mi2Ts;=JjpJb&gMHK9qX1_c#G{R>W!L|HH3vNPQ-vMdhx?}uw6iDJjsNK?^Dmu zM|~nCdmp3n+*Px}+1?jT4%7>BUrADHa^>HBx@drE%s_j(mT^v;5!)kl*%T`!&wUk@ 
z;x9AXp~?hKT&?(g3Vo}nkaC5!Zo&hspu|mixAa%tPs=uu$LYCZljjy<)e5Fs^V^R~ zNm2$Ne;~I|Pod+rLLVY7X^k5&Kka;0=7Fd;k@AXtnYBSF;h5JN&;sMNk%%T|DSV~5ogH8IUjn>o@qsJTCG_1-@y64@F z<-9NIlaeLN;BIFTAN3^c(~|j~Hk|dlG_PzhyNy%fqeiDRuU+GKhB~*D`fc?R55E?T z^+BJqjfVgF0Pl9FO^x*jE&JutVu|(5w;4tQ-SJ7}Ve=jw>xRtNL-HIMhZ zq%hKOhhB7m_RknXym~+qHuXd2*Wc9kv6^ba`HhW0x$ALLw{#y!W9_=*U&d6@rO3GtslSG>k z2kWJck{JPV{SOT`)>O?sWL}Q2&Zoji}@t*(dE=qfNN_Ozn{+eh|DHG}8Dx>hq1mi#Kj zEs=@lTPbe_cAt*_HilLi*Il;b`2g`KtX5~qCc&Sou6BNnA$(Gw?|tnczBO=4z;x$2 z+w+>^LS?s}KEB+I5ZY;LmV$hpp0f^W5R2Fzvi#@ujgI*RQB#<;-N2!8Mcnt9T!97B z$Tjnewb!L^sbEF?#?aVmUwmky^ni-gRfKLP@bdQHEsa35FI5b$-r{+ zVA1T-v2kv)fL5QXI@4(O*5LKW1)?xWMVYu+Y~wHeY&bPGKv?~l^J5lpm*QS7VamEH zB0u{KQ7`MAvvvpsQLj}LB_PAqU>vQ#~4Uv#V2hE7Sn`;-(twc#CduAL40J;EA{A z234a9-;`ln?foumA++Z6(kM0z%eG2!Q!r`Q6`897YwUyscQ#|-V{^Z^{9d8jybIXh z+z>ahxC9>6|G81jTzch)kkbA7Y?c){qR@kq0Bdx7EH299o8{Ke$~PLv5xKAj29oZg+;PR-p|16u@gQw=O-hwRNd$UNe%C@FBd)DgC=F@DK zJd$ybr2G%-9=|0n|)Djs7zmfm+1}#qqWv?Nw6?8d3zWH^DVQRd7Bjt zysIDC9Rl;$!Opdo3wo`q;fO8#uLnD@d&MC!dM;MrX#&GrHIH9h%3tr}6%}kAoqN0gwUG~h`nA< zm92C!k1?u8m+pNz^;2#pu8hNM&W@$1c@$B5rfM}fKmY0)e!Qou?9iq4N>&GoYUjN0 z03Pd+mbLz?Qf6$G(!Fi#mjH+ahNe!0XA6I}4Y(`RvYneH*`HOl`+yfE!1imucDC`# zfFM#IIa~07rjQ@Bp!)>^L9rC0X-dJbJEMX%J2(b&C9K(}_!&e*=a!(vgrrO5eb%34 zj}?v?UKrWa+QS|FDC_;Jy&SCMM!llCbc#?7Nu- z4HNnbd`rc)(akC4B`m*36F+&pnlliTyHwxd0A*{l$WMDDMgmzA=O}|=VKE;QeO9g^ zx(0+R-U*(SQW4hI=wT;7;O*8Gu1vx*q}!SdO+gcUg!F(D3H2BnT1}#f0Y*!r z%=w8YmhdVc)IZ41mea-ppennPdC$%!9Fy0J-cI#t=$;%jk?a$ajFxzH&-|$Mi%(J) ziLs3p*~BjC-~n{abEWu}mc`T&PWx~c?i7Bo4MNt#&S~}-UTHo#Sx3RZW*t_(&e^?rhK~$y~ugnmXl zXJ>nmb>HV?7ykPnNI#VTd+Kd~(BxMC#9h+!z2akp$B}HUp5biA-A6r-`=|F|El;Yj zH|BYZb|uM1HdPxhn?cu^`)WT(*BhVE1 zZYghPY_l%{@#Kof@t`kk7H}z>wQQjYy^B7NdF{u9e;$jFkzBMK7kMKb(lt6xmp|Mk z{?SLW^^7O}fmc_-{c8h(>{}`|$`GN~3;tR* z;ZH4I_zSm!;${+%ozWA!t}$ab+H2zL%#Iz`9-M_SiI_j(?d{9=^r+hNHG1J?A>}c# zH{YjNRiTgU*1DwYlwEI8OX=6NYy^ui1o$)+Iv=)8QDA%bRGCEk&o_b_=G7*vuO$iG zdd?avxTji78axZ&@pI>`@=(?54RcLbao_g_BU=n#MXoDZ!kHS_UA0r!A3yCnJ(nKc z`-#7Ej3_dG`%F{#1RtL&a_M-FP-p+L1^i`FiB_UukFi{< z$pzwU`Kd#FFo#22=}%8WBxWCRG-JNpS4fEIiITWWL&v|9u1#dg7Qdxpz<4-?L1^DO zF}NpRMse(LPp7G64>;QxApU&d_6qO}4GZyd^FR0$=1DgnxG#We&zf+bwDpP%<39QKx`zLIH~QbZ8TUU2?fGAaHPGOa2mL+h z5dN}z*p(o!{}lvp0iXjxfqnT6 z)+^u=xT-2j>;H8Be?s{)`sWiwz}&>l1j57&fiQu8kU!%PBM2uuI|n-(CkF=y7Z)ct z4~&=R&>>5>jYdlz*V5Hg)lpVMs~%j0 ziHnPi=Maw&FRzfQw3xK&|LfPE76_D+Ifg}wg$WK}hBC20nf|mxBtbjbnEt*X|9dep zv#_$Ub8vET9|8rQ2|$>cSXh`@S=iWESwZO-@H>PR%63>9b%yJ0PZ1lr6< zc#Lni2Pcj4u7E(RJZxgDLP{UDw7x40YgND@8>*yBRFPC$D&Bm)lDplJrka~W#&cH5 zBvh-U`i5xFl)@_y4?$rJ1w$m~eVG)Jn;?<~#Tub3(W}S?n)JO&D!z-B>n>!9PRb|H za*`;Pr=)3610Hn3f5(l{q>NlbyfBQRS;-Ip>sPJ546qJH76_iDj+&v?y@IsOiHba#SJ(B#B zMwzR2GkK|*?ONAE@kQt^V^K|ksX1x!E%Aw5^lF5Un;>b*QeP6@I@|pC7GG@5qb&Ri* zx09v8V5snWmNZN&l8Wy`zml)6}L^(oPZo*WA1Re26&&qLxo`p zwmH9OGeiU(O}YZaouHZvM3aCQ7$cPDy1`lqFa~lHY}SBK#2D+4%-s>_-Uzz634!|e zT+lRfMOX#!Zi0X_+65_2MS|fNWzM6!QpxJq#P0fM)$|0Q0W%+utWl>-H|R0 zqzz6uYb-6aQOOA#K7$Cx0!_F=<4HTHuZZLhhi3{ZYtbm!Jm^fOP&vhn|5T4zgvY+cZP z%$fB*^h1tQXPw$kyM!CwW)_Z#mwFlPx)I zs9;&kK+?<(eltXt`DPx011X!E`UXtAm9tDqW8@fo!(~(9l_*v5T84r$I7_w#0)Twt{EEILQnTQYMf-Qvx0{ zLsi7s=cHhOLZD!+NK*S*8&WJ1H5Cs9by4Zk!oYl>W_46>9(Vf%0;ixfO{GuOk%3I2 z9$3=r)qlo8v1dK; zK?;Uy!0Mqhpnt|1FlS5Xkkz z^pLuc2{OU+@=SpVfwSr?iip#ArbEUD(^aD!y^&nyGnue%VKOO57*;MclNF3m6}^jdze#H2MUN!l!)Gc% zr!oqR<&Y_EwA`$CEWB!_MqH>*AC`cZAy?3#cnLy-Ue+-4VI=jcEghNQUa9INeIRE0 z^lH2AnghJWA*ptp3Dsy|He?dj2mvpJ)8OgVC;_M?oTivdFuxCE-X$gmO750{Dy@LIAsWLoxQ( zFe|A*0S%F;K9nk4ejMk1tG2aW#&d{F%JbI{Dw%daQJYg4yo5!-=x#Z*>`IiS;&lVy zcM@!qN!b5d1*~8Gh^SfWX?)GMGq83;Q#5cBm{N(iXC0H$N!fQ&!7w+TI8YFbv4(gk 
z_I8p8hk3aCp-h7l1Es13>z;U|adcu@ZNrZ{QzU=klGnj)-_F@_G|;@sLv9nnX@hBH z8U(#{@645wUc+p+5fOj0WQ6w9O$J0GdJ86Z6hZBCQvcGX@b*ya8}QH@x7iJvitBHx z4*o@5M=*BLHenf-MB+fVOViQAU7Z(^Nx4Ua2*^_RsG?A=o#7Xf= z8eGgfosRO z(uv6R1n^KCIxQ*|$w_AxCgY&FS7`YM;FN&xQxiN}lXOtnP+pn>J@Z$6Qo80dg)1@C z4te&J7CJj{`v=bNbS{B3i=kzgi<~2rIJP2h0f-T_mwlpRvjmKq#Hv;%4G6%}23T#+ zkWg@jT6Vt;TX90-Gvurh0CZ#Y$aqH12$P0Ucj4lZrz$W$?Hq7k@hKRYRT&$zP94xya*dJ zW9=isys}6tO->puS;kQ&7y1p$N-Zmi=hYE8H!^EkW$5mn3Hyp9PT=sB@?~CV^Ba zS-=JQTmzt{Lo(E?IsymIGz-u$PiR3_PI<18t#kt3e7H|e9tu5I+nTKf%>1FEfFO-R z$_FPp$hFG=s0K0~h8{(ta8Tgu0C3tLD1fW+feiznIC0iErn2DxR{u65*B6H@^ErY5 zV$`uiWeLBF;H1swX2Noc?BR4)+XMgvDZzcxpgs&Z zGL8Yya1xx{53VHbjPs|#Pt-DA-LeFelcWg2gMwlsbV5VPOpUOlx#cruWRnkj$_mC7 z-@|}z00L+B{=i-IsbacjK#OUE(mAw|1YE3bYn%M9@>#UsfE*F?`?-9{OWW<1lG_a6fVLhzO-45K^U=olYK(%{xw+3nl62NZ*4?tr)h9+I>eFx~U08YdiC*mrH zOe(|>N-%vYKqQ=1vVisiW|?EEtB@(siYx~zPJng6fn`xF6>aazgk`NB0I-)(pEfKD z!T|axD|ayBin(rrC}4YyGO`5f)e$NVY8UL7QjLCi0N8kDoxcJMHf}!{{~Unc08x<% zwp5(^U3F<7$LvHkSi6a_v_!TvpsXmUAZ@8Kg+?)eLM`c*B*WVFAud>+??1H(fCASs z+{@_bR3By~a8W=5@2bNUkm_qV_jJI4B#tw;wg1QfCab%>uMY_R#u!F zsN^vPT`?+PATjw)>@j155_?;$ao1}d2%KxU70Wo z`5a!{MSvb6@Eezb{BM?>f`kCf*T^!$P>0CVg?n%263G*IOf^RmykTJQt;3BwQ_sl5 zYQZ(mBtj*xdhJp%XqJ)!xc326LZP&oGQgfIC|Sce_gsfu zS0Fq(qUsR{>C2y`aB8dw#K^Cqx#cLi1L z;UmChWBLxlpKb&&IRF#7mBr9mMh2kCfJuunpg5q;-0hJ-nqmz193Gu0fXA8sas%dU z?n=Ph?-)>}tQ7^mmwH){%$1KKfC&GZGy+Z{`vA33YX2Lv940yAfPLVGNUX=5P!?teTxRD=;(6$>uSQry8xj&p zbkDe*k;4VrRsrOWA4`B&18-nx~*AOW%q3sc8&9?f#{ zf+;H|x>Z;llfeUHlI1fbEa9s4Q3N>mvki&WG#+DYmg+-?q!H-MAbfo3xgD%#k&7~s zhx7?k=`ctw&Sr|pjWRv~Q71XukRayf;YgIN#eu}EnMf#O7^k@*a>SYibT~!`h zWy{IakAX_f0F(ucMI7R#!@k=b7-VQJKkPfsJsy<=D5-|TNV~=}Sp67rTr4r;1L1*8QZGp;x35nGI95IAULXm=nsBmB% zLNkSRWb324cVITg(cSCC2hwQK{~u_U5RbrcLk3*NCt#2)_uHA^Z_+&y)>;vm7F}-a zeu_CV*yI3RffcfrkmLoja5H2ui{pNeq&^h%xCdo5PFEfS9=8S8Yw=LGaju3eVES!sU2kb)hP#Q^#Pm2n6L(fWU6v1gcj z1cXcm4-5csqMQik5l|@5uW*JRdYM%Q=S~ByI0z7POrIS%z-3lVD%~7_SOip1Lr{GT zYZC;xiG!dzvRt@R80>_96h3aLhzC9pj8XUwK=iud@Zp9O5+E;mKGgBQ_ESRhbri{5M1&ZVAMl|6oTE=Fl&4veV_rDBQH-^`*)V z)LMf|kLk{v)xw?m=8vuWvQJjL6Z!7DKamMA*9qaz)wsxA(D@a4{iaEWJpF{Sd_&R< z8DC?3jH#g%SSGIFfY_LtyB~qM-zGr<`5Kk@kW&68CUQyqg`=vb;rJQNevUx=s2!PEl~Zb6>a%39A>{-V z3W#OhwP!$6gTbdKx3ZQM4?@^-B+0PkUiX|L#jLIz*ewWVyW4+Nk< zftZ!7O$DW$3>(>RlINd&6e2kBygd8zGQB^@J>JB2V=ga#`#n^He&<-mn}3Vj%Z5DI z&v)x1J^Baro9$m*cJs^Kd)rR@@nIn~c4h0|H(hNk7iDTTj^k|3y$jbZBbEypHwREY z7T!GlazCt3JFrMKZpE&;JlZa4W=1ox_NhR)?V40xkg(>QBmcrh`$@u3-Fy3W*EpwO zx}DXGT;yp>ZN#EIPUulF@GUh`y{R#lCbsYE?oL+&FE~R36FT==(tx3;11`rJNo}d* z1}y7wFFt&P>Ol$s59l7aM3^<_R?BeKe60)@s~cksaqRbC;O?4DOdZSbA4%oC;Lf-hVf`zJ@65GuqdYB}zP7SrO!^POSd~mLGO?27=s$)PqZ?Z_+=>YW{&F#Pc)8T=wON+7W># zgnMenG~7(gwz(Ig`9lSNB)uGlW^n~7Pf)kk5WPPVT&;QxTE@p};-p zqObjdGR0pboFT9*e>r_aZ?zp{ zQ#$`kHx9H(><=UL1snQ>tBFucK$hbyK`f=gT@ygZD#?Bid+AOIbtg94mUy@>KT)F*Q^M{Mt z=&r|mzsvU=Y*XJ@Wz5>3u3WPVsi;Zfu43U+ncQ!-^yN>BmDTLOSDLotAq__L3pP6A zdkm+s(4%|fvyVvQWiL|;e|=v7sg9}`(P1k=eo9gH`rigY-|d{+pND*pkp^MFE^`x{{!zmIz8SC-!8c#`=g25u~;IR9$Xx!}~m>BL9-Or}_U zp=dC)XM9W<{U?8x_n-7aECwJoAKlHH`FLw0<5csj<*~N3J$9f+hc0z;?0YQhFHK4A z%cX`%beIn3<0BvCq|pO`YzOI@+h_Q};M)|E^fUsuavuJHu#$mfub!3YxHP;aQF6_+ zI{JC6?B?6&o2ffLNA-U>1@0=v2^Cn~IHk0M0m`oLbA1%EokrfEb~Z=X#UdhGK=Rwp z%D9h1z;=hN+D{6F0h7D~c13&janDz`qB&upvhvw#Q{)=G?+-*x`GNk@mtoV?U1u!` z*|)DmORdI=ejU-Pq{|JvC}`Y@23fqXwm*NpeNk6q|A-{m+jy}10UzoQMDj`8Eq%X( zsvlR2{yT@7`2&HKPp-&4)?fN%wNeJEY@=%jMO)mbdO(9r4oJ8?W8EJci?4Tj6}m-qWkH03|-O{Yia@6E1CHeKYLQe07FX=-x`i1`t)yZfQv0{!ZvOk3)YT!LNQ~ntiU{U5nfK{Jz`eC*|M5 zb&-)*Kb66W>EFh>cg9{8#n$8ER%~Nj^{RFXopN0Eue{n@CD4%i3m_57N4b~g`{Qpf 
z{P4Zg`3+=%+qL6xd<%pit=JnC1IBu@D$a5B*xL(VV=jFLkgfA^e-qutbAA6j_!2F$ zrLl7XufGO;0U#t9h~i&gjiol1etyDWm1!=YOc{uv6{z6va@9syL z#!DY}_a+whn>03uomRg!_b%*@)o%`OzfX@b|7YL? z{ywONm);RKI}^CG`El0j+siW>$0oL$x--pW?m?G>&bh%i^4BYQMxz};f9(4QO+vE|49_=};-n7j|>T6vK$;#4iA19BENUp_l zyzNjQm_$z|GhX5l-!R8Wdd+*lWG=m$-{rs0{b?1oK|B0v%4+AkLk~Spgm5m(X5$ZJ z`R0!P&d2x-8&C0nICgpCI<9qbsG8_KIl1}1Cy%2{*Cuo@);0!UqWn7+Q108u!%Ooi z&t{GD2o!j#ozyim5Awp^?cMubIcxS(uk!*pANakJn;WN*l$DXVFH^HPV}M$U|jv1iGR7u(Z;VTPf3=q4lO7@AUjh*RP<*Ux;c9O7Bcjau z2_MRK<(ZIaU6q=hWs=n@2)mgAl*J})R$Lw;7GBzXQkO1fjWYjqDX7>fGcd?le)iEK zh66lT7e-sAcB7p=%UNN4N3v;ywc38J_MWd7;Ll$211mSLC0MP8$41X5-Z` zPUqBG?>v$`(+7zx22;_BS)mg|MZTp}2RDe8L#iM@P&KJ26=aY+n3x@fC4>2)bP|hA zh`fvl3UfwmkokJ*dQ}2%76al=U*9Y zMp9Ni(fkF3Cg~())~k^q`UNo%lf;h(Z!4ppWoidnDfS?SPm-TQFIT&@DSe%qO6C+|M=zbgfn zGW*}TSpLm=>g1Sz@e7-^SZQ6Kn>pbV8#k=wy zbSL#(+$XvCOInLO$zPIg7R`BuVvNtLYv`{nY3%Kw<1Sq+xP6CX@km^vc5%o{IyThU zS>&IsMXyjmZZGU2XN}*AW)BDQGbRBUzFG8C-aTK@ATr{-W(Oq}6gkI;TREk$>Efav zadsqauMxm*`%u`?81i#P=SY0J{yNJDNZZM0wJ%qoBRylNisrQjbTJ>0>0@WS-9PUP z)Eo726a@tYk(QE{?vj|IFnSUK(gFgb zyJ08-lA{}jba!_TBu00qNOz6--rxJa|1oy=?C!bGJ@-7{^Eu}T>$FenHr=Z=+@5^Q zgG-E59DSpWTpVB>mCiZO$$wrgu*J~) zfG+YhXNZRK8}>_MS>0cxOZvWm9Ona9@7rYiAJMPCGRz1IJvsE~W4E3|c07CE*7)E5 z=+RJc2mT^@{vwlZ`Vdx1aY|@;C254a9E;OhjUj!EyeAL=+ymgLjQ%9%gN11SX$vgG zXa9K&?!MoF;S$Hu=c+F``+fh7dI**k&k;63sP1$FfUjVdS_na6WtSl|6yUl?!Lah zCc=dggVxMI*G;;Y^fkfGU3DaKedX@JaKzUCu-+4_T;sH(4PIXpo4>jK4(l3*T{n4M z5Y{mL`3Yj_XLKd%Bd=DqgH z590rBl8AAMJ>B3nQWsr=!2b@zw)T4Yo=P&x`ValJ+C1AmjxY)EI{_~j540w42|FjB zY(M(h-i2Up#;wg?yt#s&XOo>%pxK|{W4@2_{3%ch;{F1-wMf9g$aTQTXQZjte~hlx z+g|>|LX<)LwphMnL`cNDrJud|RVcGug zdIXx~9oM7J0Sk0=3p*igjl09G2gY3ibnKt>IB1?X1BLW=9!3~O__anGIp#9^AJ#X( zXYHA|6qdl9W_L#uW$?e1@zx&gUcCwozJCO{Z{Y>b4?_Q>g#)yeegjx21f~yes!`Ea zt<-qdp0FI}2_A0?p?@mL33z$-z;Z7y$JENVKk*S-;ugjR&I@&|?oz~i?VuA1vA#!+ z`0076ZPn-c(gL#^&}|xP?FL$D#=g}E?y|?{T@FM}UxE5H@5FC>x4fhD1;4m)7-BXVDFF9W~+FwMR5$|&m$vA5(ozVQ9 z2&jeyJMX+E4f_e~GjFEU`3^zz!b{gH@)IuHT3;vD-oKO|q?`MUG;D_JdutOMy3@zu z{Z5yaiYR&?>m(p3Uz~tF`3=IB2a3lNaI$_+QF)PUfZzy!eDn7FApqJ`KKh`R_V}YH zO-4G(OQ!aK?bq3fF5^RXyvf;|g-;HZ$hRAlxf%qzLG^6zqiaGPNS} zDs2r}YvwYttQ;m;v84K3KJ8GuXxYm_e-WYv#Io?Cs@9?2#j{y2yoB2g6vB$;L)t4$ zEgqq)hsa-F`e<_$f2E5U$J6pmVf%-}YwgeGuERdGw}|TcSc9TjXP%TiNVK38ACZdi z5RUU+u)k_3FP4OIkBuIw>z^@IBcaqxbSzzYyBAvEgO6*r~qlDJf&Zf~5il9>aB~5+etr3xptAO&InUnH| zlfu3#fT6htmu+)@wFljP50%W<0=aB3;ktWWl>gYvhnWsuer>#S^5^+Hdt<>8x=%GP zc8+zT_T+R>B}qvK#5Bey)y{!SD+WmRrI8N9Y<4vbfXaW8eU{tHf<+&aeFmUJAQJF~ zEGdvvs`iQZf?9=Vs?6Tbr2Z$3eSFt6W!(v z>J7XH^}_09&y3bg)#rslYG$q7a9L`?wj<2gu71ya{QhWvwUwww*X&}I;t1sqP#P%H zPctoMRPjLnRt7@t;Us3ag_#cAsEKAu&mvgE_g!fc$DMT4Pbfc0RjREA&DDtla><6x zYy{Q?^9vXeifsLIK4&^Xn@{d8tBHNPuj9*ik*#}cF@$OyCp9Yg z*U)#b(ZnY|e)It3VHQ8)ZSj~dS~l^!R2bBvzN zYKe;v^!uL_;cvbWz{<~^c!%J806*DCN%XelWU4fx+l#g&Z+hFFCW|GJ_jX~42t}^mxANZFJ~G!m9a#6>7Z%54_=>l2yGaJ3B=G!sxOdm&DPF_p2XVt zd(%iB@Yu>90Aq$BblfLT#9to%I`orJ)9VhLT zU|C*CiN%0ZoYI){c4*Ye-! 
z+sA|?;8!Je^yr{LrTfObitcMu6~Gd?Iy%#1CWV83rDhIuRVLjAKoS^+Mk1xOyF&?& z2>X9UDgAE&w z-;zy5;iB2P@@4Pkro5rtZsJ{s9k5TH9K&!y$>;Sy{w1-}b_SoO2GvC<2UNZir@D~Z z>%=zCQH~$E#)MxpriL9VoZGhd!to$nmRHUTiDKN6k7Zyj*0o>-rq9FyOa#U3{Y*2$ zC(BoIxWE2Ba|a3*Qb@`YhR#DzNdt+It}hN^glLRiHIRPU%Y{b8B+?-{y`J{n?I97x z3ipZSke5r3EFuqO3RsX5UJs%g#gA%c@_V_Qdt!R_m{os8k8bJE{UDyyV0L9;&nCr6 zy82JkPRirPiI(?r5dx@=j4Vn*Fb8<@qb!JYUYPv5>2gHdIuOFpnACNnu3uE!f*;`l z;Zd*9!HT7TyFGc0`%2Z!v4YQT)DBO-FEB{Q;s9>j3xzkGwD&z z`~)N73-tHL>}@{n%h-Ri5^Y6owYj`WhIv*kQZZh_YzR^G1#IMdO)7L=D$vs%TWIBM z3P`t$2&>mJy%0JD{-2X5W3ap*Y+p-R=qNYe9Q|9ip6tEm2~Yak+>nMW%h2Cs+fY1S zS}D}^)RN4W4WmP^mT@Y^ew6nPOdsgzv!~lTon73G39+y<FCjp0XTp_(X@T^HN#e`fe<|cfWLIy3aLyN*ZIteI_Dpf!$4VKozmq-B*{EFHY56l2UEqso<8r-b0AHWi#xp% zoYcU|0B<}9$l#P@j)`bl*|s(0ZoZ*O%IB3-*NlQMzrJwAxXwFD5nud)N2(Iy0tv`j zd49BQHC@j0t!5~4&ZQ3oHC-)}n;HOX$NE!o+_o?j2yQyr$Y`b3<;_F!CIIaNxPZ#Q z%k&=Ex`=q5d66;49G|3YwiGW7?eOzLidv77TTbICq{3VnQ)H&LzAlpk8dO+{P_WV6xklgZup?97^NEc=T<`SlTKE~96|1#>!| zSEviiSx!won7Do};cP6hrmll%<~-zkE>r*NafR#Mk-!9XGk_JF;GQr;XvdJ6~`9J*)rdQh)D{?`zB|2%dL+;@d6VeHh-x z6Z2n>hZ-VZ|I*Y^SNR=*HuHu2c}yWQ^Xup~Kf*^myE*r;+$M(c&~mE2or!5n|Ds&_ z1l5Be`g{AJvWjGp%8WY1pNGtAwSLKT^tpieuA#K~a1;Aq0l_`B{@u!lmwO0ZZBxg# z%~@RM4>>qD4&u@m#2lf|!(&I}chxwnVrwqWb0ks`5DgWEP=VnZ_w_UqZ(EI6Wr63~ zl#jt`%6poR_8#S5JG9Pb;BKh8{uJ%sm%4jd-dGtqX&#^Sf->!uD-}d-K6_7fF+&S4 z{bn@?6E{*bO1K_%k^j4rXN7yN;id|xh?nm`k;@8g6iOcEW0q-dv)PzW)ApuC_!rb0 zkDEna=NhwU^V20spQoBdO9*fA#vtox6ND-b9&vYbNuuMkm64DJJv)F>@nLm zE|1912jm-S_>Y{5<-1r2_A=a`i_^cmnHhDPV42Dh2!w-(4u!2FqG;WOQKYVY=s^sF<0G$(>F5KT)!*MLs1Y5PCt=$OCm1p(1x9!>%w z;F$0KF{)$#u`i2SRDe`P?OV_&LIp^7y_ES50PALg0La+?#sl~b;L`-XtpDLa={;b8 z_+ilfZ#+0@;Qzu%y(Y9ZfmZ4@fzUG&K;GK?5O^a%QpbRqDIeA7+9|}7h#mEw;vUTi)8+!ANkIvZl#P5dB3)+qg_}lbYUS&X|GmFta}Gm+6l>xj~sb&aIFQ4F#9@| zIbNbcp&QtxS76u;^?A_8AAK_;>nQ1_MIk!jOj~0v(|Fh8LMFF7q~99!%SF(Ywhaw) zQ+Uhe+fS)>pkJN+nD;A0>CmOCX3@9i*5=cgP?8WQzAfwWv;OFtD+;7PvLO>tymb2P%Ne{heFS;!dX3ri~SFD6$@=CvL3swv1m`C@=1sI!SuO!(3e10=%(6SF`0=k_y?7Txh+*mlA{bgtYiFBJafYg6n+ImPiN^uF>1Wa~hZn7u=M-&eXd z!k2Bn#TzM24VOO=uqq_BrtUL$)f^e{irx*{s>EEF>ZYjKlE`K6TtyHC=~fz;s!h;D z#lmakh_1eM`z4`^C!Gx+JC@uB_h_Z`))?x4Y&wbf6(o-C`&+tBPib8@zcSBsfX$DW+(W9;!-;EW!T6~G;d54Gi4m)_}(H})8*iy1ft`otd z+`#SL7Rq9b_gW(T?0$+SXrqh*lV0nPE0t^y=v)TZ)5o>^IbSQcO|b zKsDE*S;qHY*p7SO_B4JTj7CfddUH%vy#;bMu;3Y+2;G2KRo~{H${Wim&(yV^8b+A#$m{7q-cHJk3OtnumGV%a5VV_#If>O0 zs+{uHQ;6g(ev06H6y599(T5q9dFF}IaUE_BuZLNkRfd5Y+7F?PxqEM4lnmhrE|9s) zPumMgX|1SFY@L=FGiQ@6g@+SIt_FN=9Ak|NKASCHL z=)aZg9Pj)*Um=4c>?c@wzSYS`HXDlexmBQLeZmPjr`rgwt8TeXcnNNxEO8jKUv)lX zeCkNV`=jyP;)md7lUSM=Q6jhys!`gijuBt;U!@Ze;LNUVs%x#+ z3;VpDL4H>yw-$$C%v<~7sGwm>Y75wChHNsC*BwCe2p)$Gk(EkT7(nF$31c;C6&ejr zQR}f}?0{9>W{V3WHTf^UW-&tz0E^dOFQ~44Nyay@0KkYM^CJJ*1wwXytw(+rhM3bC#fpsE-K2L9 z?R|?BkmIohF^!6;cKtycJNygIciT6|;6+;4LkG|sB}-+io6P|xQTnKj<6Zs3Z{?vz>dpaYf@Wzf=#~4-X}rNmF;ovtj@-t2PW1*={h6XMQ$Up zA2p*Dw;=}xz@_)BtoQfaEYni>+$RT{{bb)W-GmVI26|?pc9G{rZ)p7|gu*O!X!r1L z{9zvVN0a^5ipYSc|;}5`ZD3~}8a*(w;r;uE__g-oT z&+k(?4ssT>A6I8|#qO8?0lkqb@eZy^G~Hmjl9lxRYwlE23i>$9wYKy2i>{%P)D-ii`KvT`EhGMY)PC-BU6L6(0kjhLj0{iS>4IWrqR_D z^cGPEcjGjhBrEWd%2aR0)P??wQt&z|HD|foI)9Mtjy#e=;lT%k>=$O=Vn=``tYxb9 zFRgD&YuvQ8`B7HkEY4i-xk#8dBUh#y-+4c`XZ_1KlD|236}u4SB#2U#pqsq0BYwqh z9`Ue&daEJTneUvE5$EduneyZYNX zVN9K_uYP()2fBS3*_41u+6r5AYjiWwvYy^D5gav9$f4pHC8f$R}P?EjCKmFSqCC^bx8y2N|=-h6J^EiFil@)3M||2ZCcMDSkqIevbGB><5Lp zFiniri=#NOXyS~7MIcYmU(S!sH4AK~tagM{sZc2i_kx`|fj4&6mQOqI=?*a~fx3uyq%2%|+k0wL zU3C@cDyIJK%#|9T&*ajFY-sfl@5ehA+r`P}GVEv5Q3$K5iMRSfj*kxW9ak}~pCfh7 zGQ6hp9Z1Bo!pAN$ABK;cB#PxbK2gCMC&_qCgNMTd6mwKz$QkCN5W-eS0g7j`3&qhv 
z+ZC~NAzh(@Zv%1}*g^@RVtEg@qf#KbELd-1h1AY7u~H<)El3+5jXvB76-)uDQ(~=J z{(%aHDvx4^Y{v=(=TedONF!**-!Tplz-dCA%EC3v*EzU=NbX}qo$}W+<+88LhrT@t zDE*9^vaqf;;Ymq$KemeKmIQqzTP70H;JY3|%m!jhKkqdzY1&7+R#0E5xq*$+SF##6 zF1OOw@#6mvrj6F&CzEhOffE_%=OQCB@-UJa+wu7QW2%>E%y zNHI^{9wy)kf?I2s&Dz3owh@`@5XF*_YHO!$t%~^)RrjR- ztsIV*d!KJnOZ&?f2$Mi}{_e>I6JN8jmxa5KK9SF&<&q&gE|PPqGV$Pza7m)4K?FgM z9QedST#7$j^G$nStF9jD8^d&4C_7|%fd9-b?f9daBjz~k4-t?pWS{G}One=d;}NS; z13hG{nq-aYyoBa_5|Af-p^>3^i5hQeZZ5DX*=cU@jp~SCjtDf z*e_(C?Z-`{R|EX>wJ$Y*ORfUso>Bv^I# z$J&o9-0N@73sbLLXc%v58DKmL%gc;Gs}($NXfC%D4tMISJ9cTdWF%y6Yvjl<&Cm(lf|mj18O3wHGuH<`JgWt>6&qUsQaAn8 zlvArg@^pGUvWXmS@u-p$F#sxl*`lhGG4+xCvx%wzVptn7h1}9GoB2SDQvdJ+{!V^~ zjPjNplq;BG)AFAETn-~wHvc>$e+cn{&{Y^{6j7~>wPf9JvgRqwy^rVrWM@xboKm>k zAA=`FV{o5M9X_2HJR{9Gfm^!C1jK>4&8PE&!Dvk{qkF%1|FE*jn^idw_|ikZ%`1p? z&`7esFO-?S1$G5L%OM26`()RrP?jlkJK?7b9mv%*r^1TfXOCHK!W~pNy;bYXIe44; z(A)7w!x`hKR1=G2jq3q_vj1HyiHzLmn#;Zyz!yO`H*##|*UoeU2|mydhwcX*P*qsBS;n!F*t~BE zNtc{U_ztD<7m9hW?#ab*7-%BUlcDliRQeqqB;f~v85`4C(KcY;WY~q0NLtVL+hMWj zw^*<@-*%$=Xv2J=4MM3ZLNT+-GSE}S(+cxGx4<=-V;o#X^LddNv7+Z&1an=nksdj_ zGL`zP;}$7chUdvS!wj2aU+|CQpihycKSm#J`vYl%NxitVe^{g7Ui@$UC!7^Maxxzc zQ)R`r0!DdWLjmQ5r9Y;i^!7o{Fpt#klT2)(nxyU6Tr3_-nsLI{M{jW)l~R<)KQhcw zW{1A}&xG?poqI)3a`LbXa#%yL98jg=gnExFI3&+xA98zIQaBC9?uNs3Ws@fY&D9pM zKU(CMvYWg`S!Eb*Fnj8>ojO6hKAF2QIJ8YssxOhIId>Ej50W~>%bIK&XT%|*zR&-) z{v5q=*WVc==ZPykKXCHCH@4H2_I{B<@yk+;`{+9gH1$_ruXl6&Wmj%Rw9>f z?qgQ-OG94SkSaZH8Q)(Jf|6g2154onC>rfzUs)ep>0)P>-*(Ib`=2z=2-k~nMjxCb z=~X$MWv_$ReINgW$4dOCYsEo9!3sC+Vf*=$nJ?kVX_ZT;%T zVMU)Hw3Rp1HBaBSG0v;#HRyMZCiB4M&+|6RO@hmEmN(ORH1(H(%j?dVhH;oR(8Pk~ zi0$5mr3h2bv+k<)*Zg+OAi6A*pxuKCm$bW32M7Bg;RbX*s+|afD(gv5{CTXa)Eg}z zCe@jL0Tb{o+})P%3bJ^c@2^DgQZ`1^gm~=d23|+XnV=};oP4Z&WEJbD#9xL_&`@t8 zzRP8Bfw~`AhwM778lUXYaO4m+_DyKyj_xE+wpq%$dc>6UpL=%ir~2hq@OOeAnyZ;j z?Ux55rEBGUot5iVP~IfY;d1VX@s_O>?kneO%@>PLz^Iw=!Y}B7OH}2bs%wNoBt5@f zOZU9c`xXptu697*OU)vm=@guEk1LB4{h1^PjLGH3Dl=mpO7^k;aL0<{R42oqaJ%%& zhONS7$)#8eI2XkMel{o};`L$WHLa3GX0fjA;>(1+pJrzdX%Q~|ttTj$(eIY=5za=Q zU!zYi(%p4rGF1+y<}D%!pklt!`UvfIhj5<20OXrvGp5 zQ-wM7VAW!$Q{RREe7MQCj)9vbR`4VysbD1AhFO-Ol6kDJLI3_z0wzCmER665B}{8o z5Gp258k^$(Tt_d>oR)+Q%YI2xcu^U-OdU406Hz3Y1d{}U=?_a78k-}7HS7y#2L;Rz zA45r;aC~#a1TCn5hV3<%F9gHkx+e#2fAxId&fDPbpMam<77<_?i-@ff6YhLMa>6f9 z7t27p=j>|>-`&NmCKYXUE)OEYS23Rql~Ns)VNicX%>#(AlgIh6M||(p)u7Q@2A7J{Swc9*s7 zM+qK$HgTbSCq$K4v0cO$?)G+jd3)cG|Aa;W*T8V?K4-oI^GGYk2Rx(cv)-d~q9(TpcHTr;}X>DBhLP6sL2R;d1pBVl8H#l10^|rQ#{$c${xX; z9QVwo1t|(YHbr4=mvqwYtQWKIYI83 z+HQ1`NB_V>|6*(nO%;4m78B|dz5ius#2WLQ2;UXc|MT9UALB~eP9({-irBA(Xwa0k z-wC^F{xEy)r8YoTMQH8;{<+`2MsY$7Ha_hsXp}K>>?U{)nWSs0owsZwIfjMUETd;h z9GS?Io~An>(^l#G8Ubp``q5D0Du<#OzjEuyVzIZw7~*bUqmBM=alR4C+HIILnm#Sf z_!mInx9f@6&wI50Q^(6Okb{7CK@i@bC2Y~|GZlMc- zZ_4A{__i%@{GMMc0~BK`!2j1L6d+bt0;#hM;kg5VM{rIvp+vCJDy>Ojk)d#G(Pc)I za?ii(Jab0&xsAX>Wu&)paw_zYGF=t-{>(7G#Wc;!PgvEuHbQR8c=-2$_`rEqXW@1| zJ+2RnH6_X4H!*dB4a1bEwW@07YrQqgZtfRAOT9K3MW6nvQ@|I4H*IIa_ z6<^Xju8@x5+f_NqWz2jzeqUI|aNT5{jyNY!(O$nIJlEmz9A4=rA}ZZ@PkWh7`VO13 z=I1c4Q4?N4CoQ3`l=Ukgsfo}+z2-UW`eRAsf=*`RNJu+G@jL`FWOVgS`_c0pfj%*8 z0?)f_x_GVKmb+kcI3Zt-Tn=G7YQFSsq68gO)IaM*A5%jzan%E-5(T>M!<}%vBFqz{ zQi!zL8Tm&X>v5l&Xd(!olfZ+6TNp=Rc0?M&{%|LBN<+>rTXG8h@~7vzM+Z1S_<)XlDsG9sYU7 zw~%+3e5-M(Mp0{#j=+ZieMIH_;v&{O&w8>?%Rg9B+yS-cCkf-cndzo7JbVNlK=AWC z1i_+3;yvf6-$VH_U^RoS3%5%y+g)P}waOF?$&osRM|3;29Z3BHD=qTlo zpIpG5K8r+6vNH1@mWCjW%TsXGe@9xNnd<3RfChV(K(>R^&fUgoS^EKWWKb{YmYbo8 znYK=lWC4UJ5`RTE4hFpQCtE1=IsMfzcDFm*HS+$0%i)wF3K=qDu#g z+T=bs{t_U;AEm(1j59z0vK;t=#Q1`Qu6g8jO7*kc8>1A;3Cv^Q`277}vxd_KBcvYc ze}Eg}6JVpEur-n;`F|W_g|N8*KT6!lP*2UloT%K6W 
zO+Fmgfd0lV8-r~alj3~)=7zxPS#6GXB^iBf&vC8r>(H*d@ zn#qShs`p027I-nOUXCtCH?C?&NPs_657D+;Bgh7R!hHnY2*AHVuOv&ZSfpe zB@Id!#5~WZ<;_6@oa5e1cFXK$uS@r`ML~v7Y?3+JA{GrT5(0i4s&OZ~lfY|P=^8+B zFaAhBF&1kEK*$H}w-w+@kAZB~JvImcZ6UM3#Mn#k z-~h3Pe^`mh&Ifj|zC_IP1X?=_WNkapjHo%lkrH_4^{h)LuqJyye$_bRK~SFGt)M@F zObI&=wiX8`xS1zh&TDnHqRL<%V`&)*_kvmh`T^0T!FyE2&-nYK2e6Jh>@gf!(5hK} zS-46*fby2UsH$Q>mG<4#GjIF`UF}bNR~;OD^be~}*-DTVmk)UkFn%+6g13E$os+z% z#=#J42AZXBN1Lq-HGnF~8hfq_}((HPJ_C|Br5ONZP-PziTJ|%+DKD^;7?8PD>dwd`iO>S6Cw1HgufugpM2kpQ&T$@ z8!m7)idBE>`!E8E2u7c2{KN7UrF~FV-}&ZIeInvG>Myo$PuZ{)3l&TAkY75*3PN$c zM>7;sg*LK3li)j!gV^FsRXaF;5;hf~yM*BK@mnstcUxzQ~OGQqQe9 z${k8$hrL%L2Rg|0m}u47;oZYfUb5$^yk^vEMvBfyOj;`S^ikrP^F?ISf8NH%JUGxf zJ;=EIJB3Y|+g>xaVZ7oc)&?bl7^@h4)9n%?cM#R17w~II(i~<$+pz!|!h;eet(fVq zE}0x9`+WVs>h!nP;E)r@^gdbAf%VVM zj)R6z=0BgWG|^4jD}+ZFq5^=s{@%)(Wk2^%ufo8j?u9svrXH;V2X&`xzHC~d5xM`` zAsNXT%~a|$jjc#5olk|<5-ItdW~diEGYq+Z^2vF;MCaMe<=`F%5!yrB#a+zeSg^%< zu6hs?H{rJLXO3D^Pc`393x(GCIbosqmPT(D23#4fV~CCcv&EQV*E$L7u!2;RtQ_aw z@nbJXJoa>FxKm)GfvpHlIHKrppDEkI#}oILk!SgtY_%W|h z8>-~lcjrMX;gQ89|f3r}Zt_6@7k~_X9+f=}F%tw2? z@zI9l%wRh%&m-9L5+#+&4}vfF(}DZv23C&7Y1QlVdzA=2kqa#BW)QJu_-@0y(5_LSk|NQ4b(lU3h<74x{WnrM_gjJQ&8mCq=5ZPeV zU5Fi}9S>wX!J4tpTgkon;r{u7*_LnT?yX5>=Qdx@a4$@2$wqCTx_9@Hw4a;C&P|yA~U0ZN^^DJP!FpFX1$8Cf6%q%Pd6kHS^Gzvxy-`+%XJZu zX8MCqXkTwcekVBucRLA(8l0b$w2>*?1!V`#zt|^Q^50F{@#|gOi)q5>KNzR#9BsfRtAU*?5)He;O1<_9ADH-=JVA{b|HHc7109(EQKF1L z{iN`5^#J>r<_cYOy4Xdv%=bKySu)-8{IW>Jr);o+XL4Rw*>Gdx>Z||lC*y<(w{|;&3 z9p3$<{V84qcl=88?WKGWb>3ks$Ap}{I;*kV8{%*ROmk5ac28Zyw(4m|4z zyTfu5%ke{A%Kr6NhtEH%?4WOCT1>YaHdbn-%{cj=+_diUwRYrzE;B~ee@~3YI`nwM zo{0`lJw?Ww;Fc$CBi@j;3{Yv7TUi43Z;!OwwurcAT4|rj`5#ubhTm|JZGU6kvq*6C z=p9Xpc%pHAGG88ssvxi||Mak>?rOmvzN2tS`>wiKeO8M6@y1y;qF>8kz)vIcF>5cV z`&QFvu3n#F%I?8VoGkWfemZ|s>RsLWjX!cC)CW~jb0{%L+a3>s(&c5nRA!METomz6 zP8E9ga!jXfh9zhsnedsk#jN3xlk$Vkh1dn5)L&`|F~YcwUKW>9LEu|1zghuk-5`9i zRG1xqiIT^b1hgx%dJ<)ZO}gf3mM!huCiPx3#1zkVfxtQ-I*>*KGvZGa+_m_T_PQtrQU{sfHa}4B<4%L0|NI8DBMSS3_NPcAeDgBi z_{PC4IwL5mOqF|o1U9a*vP~MqNf3QGi=1fX)3f+40NGIS5Md%&*k1KBk4JZSgbwHs z)!+F}7fPnGYDByXIh3p`8t>TZ`EYi|#_Y-jB*bz=KUARMp^Z1Y34&k|rzA4Q(2eR5 z`px%hxv zus4w(!ZUzzP-|L>&;ODZ;(5^_c1q*65gZAml^pF zOBkA!^K&P*KEcM=Pni0RE5ULy=X9ZbFo|T}CQUgVQm7j{=0SMSPWbeut=iZzfGd zJ8txLb)|JWmSTM0=A6#Z3P@QtlzcR}KDtivOwcz4lq%X?ZOL4036aR0_3U|fpKn;V z8f2k*ex^;LZ1+)|?;#cS@n2h76v%%Q*duYY4qo2fW^^xx#{3>W+%_KP;+GQzHon}x z;6fCZyRg}N$Fjd#x31wPVj7$TA2#LMd-lCEHm}&|;v*F9EMhlZETGLd&oPBp0V9=g*wWlr#dd5u$>Wu?wg#tf8`@ui&Tq>9obtNlwpIfktyKa z7XU(TjVlvznA6(SfVx%Ac*k_tzQ@9Zc}NCZb*+Abf1tan3{Gt1O2v)HyqUp0$G@Hu zJDsO(2VFbGHO3~w1~HVmMPP;r)^AC^d5&bsI~d*H%P#HLXMG^N?2;R(eNl_f>=m(2 zZ@@n+t~GtceL|VtnX8Jlrw1SNe9h{m``~Ld$PI+Ib0Nf@1FaBw|3+#eV=Kpw=>D|W zK#0Me>>j4;$0p@Dr*H!^O1TS92#6lN?96%c!|BcVPmkZtc_vgHvoS4lCA2=DE3DvyL7acj`K53q%Y67sEIPvBb_!mEMv0wx zTYQSUZN#_^k{IxY5xR=Q?}e?lpO$BI@-BY34L3T`A<-84)~sD+xV;Vne7cttqW~%U z6em5+^nf1_g*>!v+`-R`)&|Ei5(dwlaCISEW@hw#HClXsMGnNzjiJhqy|Rc(y}jr` zra+=?nn{u|>b>spQ*6xVP>Y1N0J^>u6U)KpkxozoO16~A=Y_bNsEJy(xqgsaSYUWI zG5&n?FB^12DWsbItzR6fy8SDy*LqlJpVE6CzK4D;-`SfE)DT{i-tPP9&^A?&|H+Z; zz?Q(JGVblK6sQ0W_8_|WQ- z4or=51TNBit3_`D>*2W4E0`zUoN@muus8n+Jgjw3U|7SKFmhc{iF8Auj*-uC1@|i6 zhV48D&{>k|!Ffm??lsAccJEH4BeF~B2Bz_APlbx{M6Yvt3&JPubxvFo4%Cl0IyjtA zKHh2;G=4IamT`@j>F2st|G9ReR*%0Q&j5Ci`2=@+yA@k2hKd9-p$BS75%QB;`kA|O zPS%T8PQ)fAVfYU#=SS=2(Wd!7yu>!FX-y7&@t!mcd~zP}>pFo(+pwuMwkxnO&9#-*i6f(GkS zF0Gv_yqt#c=NGZ;$ zY3}Uti|Yvp`1F^f{R8KiZK;xoj!ix@^kZ|h^f-E#=c0Z-Ps>)IH|-&DI>?(Hb&Mq6 z2I+q~P$NH5{{ELKUdcFGplpb{+SB_d8`G7#+=@}Imzc*X((o6%AN|b5U%iR4_9|hZ z!Wt{{7sp3t_t(r+{PzxS_nNobnX~!@4R-`Fd<5UU6%el!NMN@e=;9H3I?e@Z{MnB( 
z$9&&yT<|#s8IZfkM-xHQa8`qGO zj79wkb4L3K0}&M;+CR{YsxzJbG^N>j9l$)gLolm0QITgxV=8_{I>xi=cQqId*NsRY z=ro39U=OX zg4giQP-`-Mf)m*EOPSasgzSzZ4|1@(vM&0(?3YWDVxotD2|1UDB)>PlubC$d<^_M|9h zk`&CfB#jJE54zDGxucymcP2{?nZx#HkV6idKOCw|LGDZu9j7G{==|XRUK0kMyvVqy z?G~N272-3W_}|26@3$$efYKsI5~_U<2p^9o&?**Gy#`&fHJmJT4@cVf2cOBjQy`MH4xv(2zT$bHg8 z&C2!3WRA+*S%^OU`e~dC-KyqqVB6G}scPH*X#mTXQc6Od%hQw!dqNrzQA_os3S0OR z9?aSqIyTqq1`$Zc2pnRkXN%)9|HG0s8cCOwNwb}j0BOF@w{YUEJ_6<8!n%I7n$=Y% z4jMv?>=T(W_zhD7X7aC6z?AdJo%VwljE#k^%+N5}aarwK2hb;;(3wZD&_4UH_cRASD5#A5o9rJRa&C>l}vwuThg@{RRSRT@A2VHO&> zVEotGLibed?jBa&KMYoiNg^p5=kJQ`#GX#${${WfPUvUH!S2dH5JH;A3Z-hPD)~(f z?Y3~o#>HA|I>u3fc5}DjMKG?LJP3Lk`(+bW>MDggF8=~YYX5{R1~fkzw_i@^6%EOO zFpA#1H~UI7G`n}>a$W>fQIR5Rzg-E`=U59_s-Z&wdCj_Mm5sPkH+u%eJ_Q4JL)n|Z zu^zEY4N&F@*=5=rI*&%idmbRM1qiN|-iEU;B(Tvo7DEcTsh0IqNBA54*8@mr4O%QJ z4zzo0`z9uGUuzo{VeMT3RY=2+IP99^u)N(#n_02(ev7(vT@)|t(XXkd2kS~^cH{aH zy#w>28P9TkIWCci2i@b2MtEpj?Or-IzoiEH*=MMxR^Gne2ff^LiFmb3 zZZah{$&)DhB45W8^Y`xZcJm+Bv-erhvAM3Mv=f}D6YJ-zl~P0-3!`S!^(X?>K6SQO zUak(`(Ki*hMDgH><(46puV^vbnc<8K@>u)%VK1KU^>@zRWGxI@^NR;6t@woaz7&V> z%Mi?ga~osRon|Z|6-nlhL85W@N|6iglLIv3;TEnVJ@}z zn0#?@z_Er}@9XIVTZx)6;%@hnsJHl~;>dkuQA#mS`^?4mn=aCcOq0F9>hMF2`?HX* zx9C1*G)Ifb$h3mp@ZD~-DHmAUcF@lB`a||vvGzy%+PaBN^D}!Ee11dpQ5@o$gcwNp z&yDdZ*552Y%xOYiDYrL=`#llVihc(4yYj;kUW|~`9iDvu@UK~Q9j&w)>K_(0S#nv0 zt)bzNHu4y*ZFA8~(F?Md@e?`KN#r5k(Q5KB929(42DAja)eg>W$qK zNs#a^0dRgdrugu+KqbA8%fUPSb4jnME@>ObbD@KuP9l64>&@^ty{r+G|E z0qO1zrKG!KAWC;PN=kP(2%|fM(J9@G*!F(U_xISfi}Rf4-1qs_U28^Fc_SV$gzvn_ zc(*a!K5v{)XB}j-RjYcIFjcqOWy0D_QcL&miu2AOtJ(!e{@@`+HTKpa6`k5>q`DV_ z^MdFI0JEoE!7`>A4@pCYtEt_jCo@ne^D96$!-W_mP(mw*x{MT3c(S&*ir@ro1( zW6xLP_MayT57+XjH$GC^ea?K+priZAk1y|tBs8&%A-d&vG1p8ekx!pyb-OL{0Pqvt z7r$uPfb1Z`gih5BME%nYNxVX_faQQx`TO6$k7ZBHXm1r(W;q#JdX3vmbohl#)|hfaA-3d?OAnKc(1|LPY$d_Wj0w#5&QvjIA2Dk2io$wd7MOJIOC|8zxcX}-?=Y%Ct6KgeAhu?OLSL||7!KOV# z7TS{S9*bT18D%%hv6Qu@;xs zZAo5ac$=o9Wp=)`+o(v+^_++Y|I)NE%qCpc7@+-9ZfThzGw^D4lC(8d5*(SnG59>8 zZht7w6}W@&lAo{1UQ*l+{UEFV*}g)B_+PiO(rnG{JXXr!vX2j8hPHv~Zxg6ecCL`9 z)5OxDr^PtcRXC+|gOj-l&qee}Sl6h-y5qj50#;7mhtnT7$}38c2hvQ|R`_eHNZ_(L z$h(StERLMUP$07FvMeNHme`O0`Ko^zxf;Epd7wvOJvJ{&-E?t%$L3LYuG5ACUL2h@ z?9L%4UqdyeELk-jX2fW2kQp9Kbv~AF_FD?8sxoG1e>}4qYToHBa*YSGQ@bG-J@Xxo zd_2l>LM^=OnDX&5`6jnq=1lX%8-_aAI z-KR?kQCN8XwqQL(Fj&Rx)1ydzKFa?0LuWnzkVq-8R$3@9pBwk*@63F^QMwwtIl&DK zm`KGTyT~efFq;Qn4aA1q8|j5d=OfnBX*b!AY9Vo>DP~E-3>PkCi}lF$c$mlGCaf0E z1)=PoXk?iAfmBN~e>@kFQ2*c)-lYVSIdr6r1mg6h4ke1hYsw@eKqYU`FBSLP{@Jzv zhhjkiV=AF>R|Iu3K>?~v;Xf3gG$!;~aqSmaTr@<~k)`Ru!8YiBZ8E62NKe+n5UfmA z!#-RKPNm<6EEBpW@1UAtX)yHMJxu#E8|{Vs2Mx*8#nm!zDd%?D2%xNf@Bej1!{9?v zet)9)Tu17|83g=FwW{f}GUvJQ^Vvu#H6Iw>bdSY^dQ-S<#WE$SU7*7K@k?EhvIF&V$ zGknojHDdwB6(XruH>2bd|FEzkoMS@PZj#jStpZ1B-~eJ+wK5F4eQCGJh;h>7jtRai zk4m6Fry1ELK~nXWsFG|z{piDHg1cF3N8TSu#YcSYL+bKQKDYd!CvhY;g%=>3h4!#I z*+c6yn7eS~HSF*e`k$Y5Y?-LNe=Fi` zfgPPR_VORA<)6rWN^KFyB#CqyZ@pkceM-0)QnFfGfdV7j^@g7(Y{TDd$g5emQ2T*b zHHUHcAFQm$JqX`+sbR}M$F#8?qBSCBTuHoSmgD6#hl=l@PsrYC^5SudaFH*;c>rDQ z=|dG>6qjm}Y2t$2x!7@GwncT?jfj+Nwi5BQUNuA?duO?STay+U)7)E2C!|_EB!>bY z1G85D4w>Dts3n%vho~PuBGYVbaFA_>C^`hhy6bhKLSygiC6)`2t=k5Azl*y;wOJ(s zmJ!RIV#+0-DeN4?C?GKDQRKPWl4O7A(U9A$x{oE=0)w}zZ-)pEcq=;K^QeVjR;RY=(9l;vx)3vN#u(wElbz_`c!%lGI ztuMY3?>yCKm>kL&4^z>jdDIwvRUDRfU6Yic@`Tw~v<~6ujm+j=i9Zsu*b_~|6hu-V z<}<7Fe08dkJI!gAl1BqCy+sfP?a1h!iBzARyU^>{MkA3Jbm{6HAHP6=qQ%TaeY)gT z&y&#xTfT<>hoa@@RQ{{Fx1dyFy@)w|{ZD-oIr%iJSQ*MyU; z=dJ6H5%h2CQ}Tz5DipyUE?9o5vpr zrC0=Cz3H)0P_%**aFv%6Ww|dCOTWTkvxjXJb2olb{wjIX*!sy4K6Utz;2J_seB?Yg ztTalMV^x8Nc^-((Z?9kO?Nl?^*`x?(td7wW@<|zEcy=1W>yTxZH2N)Rf{Cb;VqeEP 
zcaH4rG}V8zSvd>kB+i*?&!uFc!s5|&^8shI1*A>39IOE^+})){-fh#7P| zDw`DQ$v);sbC{1fCMi~wdzknk|J3>_Zv#2_cz0wvdQ?8LRBY!bsOA=ioQqC!hLS?5 zmxFLNzEgPPrl?qkP9Jtr1bcn5k9svWMoMCFPcg$t!sG{Z^?tuCU*RK7Vyi^AJf0Ru zvK;XklZAwM-_Dc{DB)hz9TuD;gVT63SUK?ytj_;K`64gJai4`g4SD(rjzCMLS-En3Xk}RPF@iHL(QfPb_!zQlC7{)Fs@v{}|u5#0;!kb22*(nx?*_z}*q} zz*XS2&(f#&mgUxh7kyTK5_!Ybt&@J#nz=ie>e?%$$Z!&d9J`(Xml$kP6_nOoPad!& zt@K<;-S9}B0+q2RP=_jhM_F8R^xY7- zPeV2o0N(qT(&Uql!=b>n+?e=Z+-&$=N9wk``Zk~_VKpd@7i#M=@#BFfS>466qrvw8 z3?eXlV@VI$)jKf;pM!&@;oSpzYcISnxp2DSJ~sg z;d|be>jU4iPEQ#-pXT=pvUPlGMqYFx-%f+@FI;*r{;y-<_Q#iO-{r3z$fc6_{Z&yy zDHjHLnDhIXOeXha1&;HiXdxk?$omt2`;f3P?pAjdtx1rQM4u!7rRgXwdCA>?I;Bip zpHozCP$yXZTC`5H=L6O6E8KV6UKnnW317Ga;ZhGEC8A{!nY7Hl2J{aDQD@6bMTB}6 zVZ(vCWmmak6yQFxI={P`jJ~AD9^N{$-VpLiwUuQQ4y6s-~|cG=Xm!(bTN zLuSu>yCFciES=I5Va@4X1K3pkYER$5s*3Vf)Yk=(SBY?+M|&6A#Ke(;kfSG!-+Tp$ z)G#;Mfk7h8-p)s@Of}&lrS~Eu2~r z+j{L#;|o>>wi1u`yqO=5fCZXJUiNW|e-`HWo6p8X4D3~*IKoE7y+Qo0&N|Ah5Eu7$ zcEPKe+(mNX?5EF!pXiNW>&6HrQF+t}`r7X2P~CaU8^>`TV|@)tKRy3y&Et`&!f=f}NUo^+ek4(whM z%G@YT$bE!4et1TETHsw%q>#7Uk-A|aMNd#aQEO}`GtJFNsy7~8EDQ;o&`gR*IhNA} zZ`sauW7_?lJoU#(boSaLFnaJ%N=3vki5Tr5*XxAd(dAp+V?Aefzkcga9r@YV>E)G^ zE{D_Yfua+ULVxmyQhv>UzZ9MdEKPd9IZ+RoktI3PM=$FyDEg$!j?_0r$sHN^(|&mg zOAP(Htov&fdR$=BqhOn$s1VxHni&Z3b~ny9496lsoch?Wk_oIw>o494vkXon0sN17Awb$VwHZ|(pN@A?cCv3yuV+l_@Q>m%GkzA|5<{Mu|l(| zz1hvbt16K{XsXuKql*2JbmZC5g>D|8WvpTT^Bg^BYmHHFie}3JLaml*7v!#}V%3AT zwhm`-0^GghB;e}XXS~J%ZAD#nme~w8>|<>M{kp8jV3Z}fib+59COTzk!*tZlx0hw#t%wsmT7H6>9&sz|{eU_38O z#1H+B348hBxyJ*;siJqI|B$NNo+TCgEc~PT^QT|~fg^1(IZ=g%wI#ox=BB{FV~pyy3}{8Vqg z_q_3hhMQC8x|R801(NWbQ`5b3>I*$$N9OSIMCk>#bkWUJKy9=1NWU9HQ&?`oojF`p zX!MKPKkL7|u5{YKAQr~{RSZ9*85I1q{!eFB@+Q{<`ST$ACCy_Z4GFs23L-dF?t|m^ zq{!R2wGftH{K8z*eF^`*Soas>2R__Xr}4~Be8{h-h^sf_?x!FIJqrjsLpHl&a^Zi% z5MWkLHriKp4e*AK;Yo7$KMPLkV=1Yx-(R0^)@9G=763S!Ms*rzBp>%t0D);t2L^ zBWJ3NI~n5rV8a|%fdXUvsub?KMT+=0WiFVqI%7*kdPkJ3fy%d~C*e&-bf$gK8RzbxkG}RsySJYJ zp##RAWdI#rzxy1E5O&et__vo6z5PrtAY<$0PIqEv77H!PcJyp$_Y0uoYp9_-Eb)*E zI$u%ZI#AKe=I@E@eXC00DR$%5tWpCR(UMV#DPZY^nyZ7SDeY3a5?yFzim*pudFj)_$B`l)|hWi#aSF>!JQGVp+6l}>gjV?OgH`UAGF8|M^N1>29TUDJREytYjA7o4)zd=c1o_uFl^q^F@t9P1d8lKIps;p4Mzwhj zuec>0MgJ2ot6!*f{{GICZiH*^d--aEY_ly(e&}|ZmHCjGtg2)RT6BMhxhT2F&yWWE zsuII3nh;-dGb(>qF-?8SiK-TG&iv)qiH z62o_>>8Cf76F__2dN! zU9JIVXnx2$S>j@wujgsXALKL`H@irqSo{3bz35tBH^ps~;(7Zp8~ns-{$sIgx61uO z7eWQ8-EktNXObJDzxyk%@_%?$dQ$%X1_8J|F`3xU$c73Hg0u|Zp>o7JYX{gh=P zt^Cd$49XNsvA?rY1JBniM|B;Yr}z>@8CQk;yNii8S8>$JJgA>cQ`wbd*nj^rI+~A5EX%lN zBN$eA9%Gw};I*&ZEX%k<_;G$(=8|6O#_kZPDJWB!U9;09G8~wq5hB;jq%X-h-$SVn znjbS+CzpH8Gpsl(9lYH8q`C9S6E;NY46c;f+UUuJM!&C|$G(}q9 zM}KDxq5HT>&O$pbzq6US_*-+|$Wh;k6!+dijdKgv)%7c>IpF1R^%tO{H5zU|)H)7`;2$7AJY9>W(3N zNSg_*W&<~rsGs_eu`p~3UFl!pBfgD*x7-5N@Wqs@5?M76yR@kV&y=qY_Ob#TS|15BzBwVquf;!?u5Ni4C%UBHFQ``ZmZe4 zXvyBC;n1Ned&_Ll+DO#5Aaqzl-Du6<4W*lzJ@4)$F=%1Cd1u(jkqYru?%Cax;^_bH zg!G;G6;8&Pi_4(ysCNnK85h_H)&*K6zj?7HX74ket&xo|^s;hfY4c%2Cxi6)h=G%3 z)TgS>|4`aG6{nx&5B376%Aov(tH$laI!#rIoX5U*B9qeRFprS3z*y-Y1_rh>FS3`v z%alT$8L)DltwuJ$KlGo1IS{J@CF(Zq1%IZZJ-dtc69!tr%Uebo>4M`w{UXX!;FR~hZ5-U$|iQj;x+}w!Ty!%`O%c3npKA zGs{YnRW^~#{8kjTnJ+brhL!7C%~djY8B?{hw;<~uoP`-4v+7l)r4TA3(82iL?RiXH z8#!Nhh)B9n>FiQG2*|?uV!j!)#YZOi4%}BzP#B%>{k1xI-SE@$+xk2l6yZmJ1+dm| zd(hKqD6S@ypH#4-w#&eg8}}W?18>RPY<|+`>%<7C7)+LLZSuqHG_jnZqhqx5)x6XB zKNRfzSrePgQw1DA6X6P3Qwet&iYa(_%TvQ3gx6S=@5pvvuI;^Ln-~%m>pcc2=miKH z9NRm5t=QRji0iFTJLDcc=4$~xGV7xK#SLKfU}~=2hh01bGt>$&r8ByuF}B%_B>@_! 
zFC_n03<~7jC%3$BW#8qSAP_-bOl+2$kCk;LIazkI(OXCndTZ8G5G+n3#hwpN`10-IM)g@_!!|M^sv;z)5yr~6S!eiU;~1y%6e~JakYtzJA#G^5UJYY zrzpH?0qlTX$zQn;+~}NxBM8Yi|IpS*$(?d5#(Nj2BB74IgyWLm_}0#TMX2FFly1X_ zZdXktnS}i&`HLYhb6mfxl6*O^j}Z0!#4;;e=_RS|l~*MnYbZrmuX>w9VZm^`iu`5C zzW}+#vo}=Fg!8UNNW(&a@xi?nP?;nZ!F>(he3>#G)Z3{`x2g*y&)zvsQ#4J7T?6}p za^W@Q3h1@VWksRgn;k)rG%j*dD*JQp@GbxYoXpo%jgI49{FpAnbP)8t$+=|HcsxbF z!i*~cnO7X-eK%kYP^eanz`FSW_-mzV4fNN-=!nL`e)kb{&~aZG#qD+f z{iM!~c3Ugz3R&VLVktJjDw_Shb#?FU1Q#4LWDzP+0y2wWMC@Y`h7A1yQEF_vaHg>eO)wOiBaz^GXTP5*m-_XZ=>-G6Hb~o~y z)K*WybDpkQpM3Pf9Re0^VS<4bbm%qd3=hlI)~Nb86%RO`;e86dAI@z}Z%5(7-Jw6kf@L1IC|q?5sY#MY zoQ;iSbhtlWb<%jH|3fUG+AFWhEMSR zyn|qEvJ&OVK6i`5ySg^jABdUMZK~iU^q7}BX^a$4@|;qq93AA9HUhL^IHLNBTO91`_ISRPhO8Dnz1)9t^c@tA z|JqV)D-Hly*ph(Ekawj-XcC!QC*KF5N)ef8B#y5W-TQ1HDZlD4q^3{`;5=|}3`a=GWj|#yvqcqsP{)`;W z$+EZW(;Gw!GN4uN4V^3HOZieo{H)D{g|UEF(5NHU@spEm?Pk6?d<^QsD+){h8>zkUSA5!K&99w!Z) z_CgtAoR}KR0bjqE5B1d{bKkLo28a(CWQSej_>P;F|BmUozTpn`I@f7{`KZZOjL8L3 zTl+X9@iUU=KNPvXY5kzhJQ^?cP9D8s?ihGCOUPu`6+t|=WY8aAL9{T2Rr2^?EB3Ok zBPdDG{jmdat6(s)f{@L}dt@rGgPoE7tFGeP5BFnZ5W*c?vAqJSDrk9ZJ@?xPh~{u` zScwF_z!Bjf>F{fAmdIE{o-Zm$_w&<2QUu)|zQuni?lQ^|+J729$-^EfgC=REl6uZC zre1F^-<4S~K)GIx(x5g%+5kC4U1Dd=BJ1he=OV~Luz;43#vuN%@04_!Q9HqDjN?lY zQphA-{5rQVTmrBadl6Qj#|s{d=*T#*Rrp0$-i-WK`PyrKb6drh^SfB%?28Mkd#MDc z^d@X+`*CSjk*MwhCGn=!hsO4{BD=KM^Vyb_XOTtdh=T6RbWcv*7U!v`%k;JtMsX5?0J`lKMj5gOw9I$-Ois;A=5NlJPYTI!ID4t#PP+roIIc0?F;#NzV{V$ zq=RxWuW43tOi;-v9 zb|#)@-o33SuYYU{+S>DX4%W#_I^CfemJud-We>&)W`Fc!?K>P-b;h=emnJsZ!xGoW zT6=4cO!Y&Z_*K=|P86$tA!nk6(y?08QX{jxlm7}MpcaQShs{pLuTJ2+jmX}TV|CH^ zf9pL9YKe4sxgR7g{7B#-U<{BSSaUO8s`GU#R%TfkGwSbT?slG$ULO=!1MJ<#8Xd2f zjb~?G%AF31p3s#V_KOd;#!kxkhc4K!+Jp~<-`==QxAOd!@+$&Cmqcj89gFM6I^Xo~ z*$O*S9s96QiY@1fx2VBS%uZ|{zM43&{qvbIff`-#$P?B>&$3AxyNz&j_NZm;fELHA zaMD@j8yia&gVu-0?Oo2FX&-;)_p;zg2esN}UyTSHIh8ysrr^JSjPL6U-@fkh+U_Z; zul;lhi`wOhQN*zO2~3s32zOx%$>BlPIbjs+p!n1Zk%pG5vQ&d>PcB4hH6^e_kI&mX zdvy$U7;y#2_}Oum_4PsPtTCN`!ztMcIwDRpSjhQ%v^(FpLwFMVm4u$ZzR^Z(5dz-+ zWhw8g+Lf|s>$awLg>Qr}k~K$GC;V}4BhK`xx_S~uTKObHvwk2W0E1S}U?}bGmDnQmA!-Snx;x4Zu zU!&l)dm9QHw%3%}_A(Ry_^6q3`%VFmx{JV=LWl)o=0B84^({2(e#U?EZo+J%qjzVI z;9UJ?wSrEBU#|S88mbS^8eNJX($Fh*?S>^T*L>Dbj8_RJ-C;P?)>V#yLvQPpW1ZuS z>71oVKO?JM{Zom?jq+xT6s$ZoGoTPs)T7OZw!0&v`_)PVsObh27%lrLHbK-pji^C7 zG%0yP#>vDg(Xa!*M7YO#;a~X3st}(8S?AHE?7?b3p}WbjX3=v%sd%dIuU;NJE9aeUe|xi^`1sJzD+%%;vqT!o z{j`)&So=t!U&MH-m)^;*LTuy9&wQc(Of{wH<3<~Xk=HMGskrec5k+JVkhV^%n6~%a zRNJJ_;**Q{atVqmckeTAE58TvjDGQ0JBJYc>A&Bfb3W@?`jiY+_F{<*mujKqrKS87 zs@uCh4kjSmOg>|8dj9jr4vXW|4b22)PMv@?ew{DDQM&|7_Qugtgvpwq&q)D3 zb_@5w33bJLnF&3zhLq8xj66rj4bWY(l;oD;9`oS^m8(S5nI32J z+v}HPR&n}oI&ld7ZI?IOYlyL)RsHE4itXO@OKLJIrz$iIzE^a0Ocr|^{Yqh`3p*m5-i6L*x zaeE7~-ODbAPyN4B)R#d=mfvFt9*XsRZ7ImojRz#mJ$p{(Dg?8#pN=!Yofw2k@A2LY zQZ|BgBZXk->;TnGrSSWllX#V6H*_PQv+=n#&M&bJdhh}+Ld}Z1EtP+l<6h2o2M%2% z8P0W@C1%XyWaPnRIX%07ECg^k1K?rNnY(OLK@+s~odO}LjTPc04MXG+8RxnF%>SY2 z^%Bx12Azq` zayXI1OW4v@GeK)?+z&?`q4Y6KuX%3NX>L!85T`otL^{0N(&%+N6URSXP&0Iygh*tC znD=RVkQ(dMHK8JCk>doXTXNMm?f158!BLu1g*yV@*7XFslbn7XV?yPJ!360*+RIXs z(kj?|`QJ#2&oZEs8p&SuOGgdh9kWX8eh%+HJeTgxvia?KI`lRw&BO`cp7$0>MTdS1 zKmZ9EcFxs1;d3u?9z$etFj|JKmGgAR<6^gk*G zpYctJ+Ft)OgK$N1ook77<&c;bId3xKl}cQDJm}&|lEB5M@U4aPg{M>bS_F{BnKx&~ z3|=lL<`1tqR@MPBAF6-6QqWz1G;=C|j{l4jM2u@gT4grU2rKG}ze6;#ZCm#U{~xYWlU zcj@_iZ&{zYo4}o@42}8r<$1iIARy!-6jQkz&r|sTmyYu#e7$42ARBbA2=C?h*)K=f zQ!mR5dl9euu)wNr^e(8qCce94RD54168K8*uWNeHr?0ui`+kUrC&l93_>I$BxMbRR z^f4T@%{PPjxY&5ieO*lNwuj$+2lRt(0edD9JhwxI)F2}0qCY&-Xq{tCY#t6qwBH_S zCb|2%0FVWg!R5%TL)@l7Gv`Z6;;ay&zU)y-od@zDui4~*mmbdxItf9hPBY`w4TH9@ 
zLDd>DFL13Ovn<8Eu&cXRz-1os#TH>?1@2-AZ1B6*zx4hIv<^U$Od(veTzNF!DmKBw z{zDCDKOVGYlhyq62q-mzX4q0(s5eQUW{eBE%8nOtaDBBe8M7w_!*GwXEezIn3*~}f z)r^J_f)yLvOBE?t6ao#l>u+h|0oaJM9ziroTk6|au2)EDFI$WKMCflmxEa|Zi9KKA zunggIC`LVve{EP%n1V|xht_u%}*_2a1DI+VwcAO9dd^)rR zJ5a~K2IQN<>OlBVO#bV{FY4&n;N>@OeYy9%DqGTDUN*>!(pUVbXRBcIh9wf)`y1TlS3d6{Jrm(-4{lD(Uju1o^LbC4-T)U+s@iwy>-fh8 zMH&8d@2DFsyCum9_^rRkbwjQ2SoinAAH{INgGZpwmfpnnZ76`8@vRf=WDI7WC+~}k zxHgty?yQQ}j<+V{%tv>+UD5RkJ~`^avN`6Yej8Q&B1U5<9RE;G+crGFb7ZDk_V7}9 zJR)9ILAsu4nkL4*snc*mGZ#KVq)af6QgD87U%&Nu_j;DSseSd>YOYl}fmE&?^K_yd z&hTv%<^gBwVUcrfa9eqNxtBak6$%={jp$vuF*deao^3SZC6VQYJ@wRR?^`1IpW zU|Tdswd91&i*%6hJ;Z5cu=enYyZT_>MS!HZYo3k^pQbzh;~|Cg{BSE2=hEc*~_82z#MAIJg z+}NP)5r~AUvbEEeGW7?8K+XHF=kF)t-*?N9$8IGhg^e6bu#K5PHR73`txG1jfRvtY zU;VT!x2P@9CMtQxLjhK!+Q`ub&_7ZypkP%aiB`xq{89Ki8M)`C52oUu%mHT0zy+!{{fzH-@2NDg}EJ2nT)OY>8 z19^vQA}rKet=~@@+^SZ;>Bq`YQ~ihHA90zVPm)qCjd3)9i^%RVBGzZ{!D5NN8wu?l z2(@iM=10Mn;(PNEBa% z-*7N!h@Srl2Px?^wk29~Fn&I!wL~y?tL^TWAoz}49^|%Fz<=&1Vl6^+z^Av%muFFW z=!SLFm(0L1@#~<+?wS5CNpnhmK|5eOya2La3+h!vc-0fvf>ST_d$%~1OnV9b9bVqW ze-mwQO04Z3&<_Q1Ds5|cQ<8!&oOj~llOc6U6U8grT#SyO|cXN zod9c+cI*R3AD3_433~Gs)X_?oR%v@dc~uSpsyV{n!NA}}_g%mjU}Xz3A;?K{$)In0 zEwk1Y+|g@Sa^Nf*wctLJrhs6k-k=;ca{2p`nk7ofc@lV!f7nRF zR%!WHw-a`!{K4c!09Rbeh(MZfN`QA)dIz#sgXVGtQpbYMpm1PafJf{;d-jljcCaG% zm#ifJ#T0hEB`8dbpjDw1;VWE4b7i?&D0ThO4%jJa)W2-=R@A-Hs&AfPe_%?->QrU-novZw(nKPUVg5=2c6QH@wPYZLq~3%q*NX$N2pi6c zdh>!8volxo8)uF}57yNSjgn{L!vomJDN5f+E!{I3M7@5z;Av}fTT?vP{Apx>0I?6b zsiLGmD21`GdN-Iyoqq|i?I2M1wq7k1o*65ar#6iivktMmMg3XLb++*J52MFkqqCP{ zn(|A(Jt_=p&Y)@k$7A6?Q~1n(;vp|I-4*X&=Fafqlf5Q6$sD=8hf!x%ekHW%pt>nH zFa0^gr-oUCKRc2nx+`u;R|hdyE39*=a;)`vY-MC(+W$(@Kn?qq2$<71+1W}3yDx+~q*my9*sZYu_Vq7MUMBE77lvcMoM&#b>D8Z(`(bN! z4!M!y9fKU3@s(5FxkaUf!HqK$VC~iDOVuh9qv_G+={uxWdWv$~)v=bYM=1-g{-|aI zb(TM1M{<2Qm?1$t#_KS;740Bgxg~K4YcKD4J1rSYtlA42n~j{tMYs8*lexm?nZ4#X zJTH>rC63gwG|<49N7`p#>0@7iYaEkIRIXgHAl~jU2z?Rnb?ZKn859fxn$0!Pk2UGT=TC zFS*pkGH`#>k5ZQNwurf+E6MkZ>BE3G_;23{^$ z3zOgf@Lvg|GZYp`V7R9+W7@quLFesiWs2eeuA33oFX*&i$=YO(sTwc$3oLGFKonF_ z(uBu&(JCz&H+}dGY;<5XclSo8sl{WsGPjWy3(~7(%B9-5#I@5{pD>Ya-XUaxBEtUX zvI3nx|6AFL_8Q0gnOhFBG~AabhezoN}G}LFKJ=}HbPt6IhjGdYn#|ToEK~}8{2?%~y#ZiH2K)3^3{c2TrNrr$*eRpiWAg;&pxoyGa zi?V~lqWVgfbeCq^gx)&e!6dRl{ro3H+i+E_mw!pyH$uBRUbt9>D>n)4x_Z#bM?-$Y zaiWCJ`_yfTz_3c_O`MqK`67wYo0IoT6_9w#dx3exd^o|d!>8++>288 z<*M;y@ljQL%d9XI?EBvF9j<}~%4_*`JHdnTvrt zt%(PvzNz5q$l>G%>Qv(03t7#EW$Ps`#e?|^DuthEp4Fn>F;Scp1PuMckg*=CvUzu1 zCo_m{)7*LX-;$x*aOJN|8$YK#xTzgFwp{%`=7$Nqg+$bPx?qofE@S6#dxfsj>@*0= zz)xKEz!AN!t8MwJnUCln!;%NunP2Qw^eH+Hex{1cjxXLMBty@ zSe8JqEv6FXGGAlOSKvdkn3Gp0BtxDjrwM13yJVFZ#Ib-COg(l zd*y$g21Jb>HdW)ZvAN**`k&3jbdCz-DP|;c{Cq_-?JBJRWY^f-D0D7}vsG;7Nzb2A zeOTyOIPZ+kq(l1_59;|3MLjv!l0}T`6zAKBs)a3h;3H0tq3Lx2-}ps?u;7 zs)NU_HUf{NQ`5fx*8nrtt4|${Mgn0HgG$*}+A649QH+cxLT!3PK1P=elcyWxm#02S z?B05_Qmi3=vaExy>vj=U9*EVVoeRv#nul7g1wACmcKR1ObC&qO3uscy4WY&Zd5Quq z<}<^(N3vo_Q?oMB4SsjYeLz{#()K%NSQ{^a)n(YeDVTcA`GwaT=rW8RYy{eVk!tDn zh>Gk6q78_QpUF1oMUc(%PqOc?Jf})jc!}ssmpMsIpS~0sTHG~Dr$JZ)JwHv*hy+zT_7@Hn`pRf18*-ACB&HluVKBlIn57~^u zI5eQ7$2Q4DCOFM^Dlg%_J8lAmgSA|detp=`yphlY34kBSJU4(hJ;6>Cx2fR1>4&9} zOUCWtS{(363^#AMp3UdJO0=Pd-KAN-dz#$A6Gj0sG+a1yu zJ}}OGdaOnYSUiy3&^b-mF7Js4H%0Wnmm%#)*n~L^QH`})JWb*NkjuL(Yha^g#7sXw zs++$ziO{3?-+F^#3=hmg(XdqDKk?sETT{M-s>+CLh(%kvY zReu@o`>6?`_4OB#sa^^{>$V~#kc95bH@HqR+j^DRNy_dY@9N9!E-IsFnjJp75=qPQkA>TCzPESYY84rle_8LZ zTj}0u$f!E@h>D&j*#D>q$eQRDHX_i!HlYg3P9a7MD`$E|U3PhUbXy(gF2fZOD8eUbTFs=# zlnkB&wx&^=RA~-)F^*<6jO*CQbG6H#vv8goWcT$3K0_tt0QNLhb85$eUOwTxCfC2OxQ8=YbJiR1d17t9K_o)OcRJXM8u9fV7Z<%ktISiA*J&{b6&Z(_ 
zCJhN36whwB9Wlo`8Vfw#Op4g-iTmBhbV#*-d`=n>b1SI!)FY!RGSSPpG zJ>ImVI{Qf=%^}f1`&eVyCRodC7uR7TJpzr$45Q~-R96!zBYu9J^Wl38qPj$GvOtVf znAC*_1&8N&G%P{4ni^Lz$STfeBd99EGu?fhYJGEqT>TZ*$v;({dTS>m0eL)`K+All zGm4xZ7er>Tz|rtrz%xfwkhS+D9h;PUWDUL6N2_$&wx@4ngA}&aM*0|v5bgIWSJI|- zc9i>>#xWdM>UA$)`Jr2~?A5vsig}kyOnNH&-`~jF0#jK^%|e!ZiuZrIY>(cKo}^DCc_trGYF=1mcN!?FvX7$!914RGSaeo43$cuAn?*S9BQ8>>bQ%R%%9 z#v;yZ0;tam%v0~n3txUs{~rK9LBPJ^(omOn>$X%6l#zR3j(Y)y`e&sw(J+pdRIgx$>?J^ zDQdSJiGewmW88AY4|<5nEbx+&NJnz4*>B9!6q+*bcL&A^JY$dPPRlajxybo}M#f3+ z?_DP_)RIl4EYh5=JM{GJO41*nY=?Q}oaB9JckW|^9BnK@=O?vA=P5$y?=A`Zppl>J zKoYBN!lZ)=PXzNrv1R`NTQldfv0QQ38jeMGj3@;do)J00$FRreNg{4Ruqk8PY0fE& zK}6C88zkI>Z3E2D9>7xu*w{$ld#UbDDYL3HY+DMr=Ly%h^`w>}7i*E*ZaC@s@k|Lr z5Ltt`1FIIm=e)Z0hFOj`Z-bKa-AD`hs zhq|#iX{2BP8Qy6^Iu<@_0yxIvDbWdB?Sz8COE7K%sOrSz06otGy>UJ!@yrp!E}f#c z+Adg>YvssP^$?D%-M63k2q)&Ll|>yHaHlEkYj}sn*4hW|RgCx8AL#1;0C&@Q>^;8s z)2(^6{pPpgSDyal&vPBJmwll5a4&ENLHzsIL!o#YNs8R-3hJ$rwusn~{{Y`TSLiwV zS6!|_93BzW;D+2o1a@+Hai#*V;3^t()#!~Aj2)w^JZsvw?{vL$!5W&}!Ej^2D;?Aa z()ga1o1uzkQ1Wv7pNx&y&!F&MT6oXj8J5?_;J) zT3F-suOGeaPaWON*4Ionc~K_imp|Ps;46Q_YEGk}8AdfMD)VG$3 z6c6{CsP(R2#U2!kPP!vaoi0~yWR4|mp}Me-aop&m19M=Lo-s?|uN_|KaEn`rQu@?w z@>1ZI5HL9m2k~t@g>Ey0>0I-2sJW=h{l&=^cM)4(Lh#LHs+)LWBF>wSOXd(A>ph4#S? z#8PrvMu|ra$zoXKo=<9Rz1^kSyvt`O^BtVL_tH1Ye&`}Ka@=D%@0#bShOIZsq=e>@ zEXk-8P*@TNE9U`j10Jo0`M(@z-l{&SYApj*o0-}&@hdLxHXWICmmkB?bJQ^h z)Sd^$Na^KU5hsv;eFIAwR@~owHh+wKs@}K(UX46DRUGZV_5FXtiLGu;aVF_jP1z-I+~?c*VxxJ>F_4z? zSP|U)|y3}(#xRQ+?$~s z@uW`6w}T@uJDGQF&)wXkCbFpG)OPC0%r^G&+%}k(LfTnN!xh;ELfctWEw>xK@nksU zt`8Zi_USOSyS(z=TU!(@B#pHrQce_SdECVG9Zx)Cn$lOYwbBj#wf2ahl}o@PF{Q=i zoe7M9aDy9s;&4WA81%=wZEwcX#f6U2#Ry9)O|p4x#Pm!8UPIKm<-L0jXu@-MsiH$k zFL{|!$F2Cb_*+^*Ex{#MWNHB9@EZUR@cLE#Vq1R<*h`>ka;(=E3I?U9r%Ai{F@Mt_ zdY|KAo`4GNtt>T73rmLIM25y|gO$CS+D+)AoWz{`&5j;U3C7XJbA_k)Jotj`wJYgX z(?he5H#agkK+T`^XixDTGH`L8g|MfpQc0_;FU-1qZNKIHeg;2;H0f>TU2^5_W3yNC zB8J@l`%sks0Luu+agsM5;r{@5iuQ{wUc$oB=hoiP>Je>%bcw&w;C`!Wjr>Fa3xILJ z&T=!GCB?O^v|445w}NYUnQt$j_g~od%rFuq`^fE*04H%{^PG|wn>t1MUD;j>7|bve zZ{htow&_n3cmMrF&|FCx@q7iYiq{eQu=yBT-PF6GqcdwZ$Vb?nGU zYe<-2A|K(E#{ho{{xz8Mg(r7ZUFK=RDyz zbv^!-l}o5|r?>U}f5Vc~Ugv-7`tE!d(jV= zh>+O4Vxp@B03}W;#b*lK#H;{$0zEPd(0~E%mAf zg3v9tn-gO%G5KRB{{SA4{Y5rQq=N2SwPu#vM}$QK_495Hes6wvzu>glW&X#pkd4Rt zJR5k}eBNc;i|d09MMny)mBp=u>Y^PsGkty!A*SO9PuI&&!K(Cb{XeV!0A6J3 zwCeu=q`$7rc(;i!Y;^0L-d&0{{rDXVaLft+0KaKewT)-QP+jQux&o!W-m2lRr%p2W z8Cck>X9OgBxZ^#!t2QY2){&&d`TEt#GRcls+A>SG;@i*s1XJ|gMQ$~HM_HCX^mmpU zWKurkODI&w`>0#<6*W_{^8UX+$f{Da)35dUYx6jLYT==^@m8^KZ#;3!i@HlNo+qPkqV^|#~x zNfX&zPjli6{Z*X`MI6^Er?U~UKk?@u_sw%{CD}hW&OLKl5Xts$3f)cQ?R(p$fDZd2 zUB~|b9&!0rN7|W3${M|jF|%*Am)v;Ri8mGJcl~|D-ejNi#D(rfD%`mK7GL+6)n?QR zoH$(iVA734{+Rx?&~jY2_Lp(Bh{v*uLeLY2lW*fy8uB*I_;c&UG)-Zj-a^Tqz~X=| zZAL#O@(gyS>q|6#6%2$(vQHugTwoBYdHm|AZ85nKNWP}8-xyoMD3K4``O`>FdEJFv zSCyQ~*l(IWOr2j6y&2<^@goCJnx5yGT=e;>16apocFT4ZV#?94uOYQZ2-3Jf&Cpe9 z)bQ{8j21sG@81p&(AC9Vq-9c0Fl}^0qmx$f#qq{sX6hJjj#2!N^A*}?-YJJxD+q}r zy8i&44U9MYr@zy$*0gRSv%0rvZfw!xVaRXu*0^1J!-B(g&4iCFzH-RlelPJJ{{WqL zI8KhS)gF43E9osI-0z-E+{tGc`ef68cHJX%`WoZ(9~BjKw$$z%Cq%d$k^c9n_pZ3K zZUOsF;CgeM*F`$9nz}h^Qk6Gp5)YUDUIF^^O?N-Ws^XzAqkY*i_)-f}KJCU&xfP+G z$a$XIq1>NNDBlsuOY)fGnn4_^%es~K6w~E(We4d%dXj@9ecehH_ReT3`+jSm#q4x;8iu4&;xAM4?7eb}mPBqNC}0Qw4PdqyAX9eu$x4)#36 zQ`fC9Q-@JrA@m5IUSk8&s~)vsxyb(jW~ig~+^h!H+s;0f3~RZ< zjAZ(pRgzhhnN(wL5ywTxCm8<#Jk-dMB1Hf(eVchKM{&qe@5g%4w?A&WR+)Uea1^|0 zk>-W!ecO^r_dddtOtX^aWhkiAM%YVv7mVX1ws`#b;FEwkpHZCYt2L9t=^&SIh{}I; z{op%{^v-fdc=YF-`K8Gn$=u3l#mw?a7zr3+U5^O4V~^7ua%-U0Zq2WTd^WbE?)yE) zCM$*m&u_c^-ko~;*E1~gToJf{i}z7R6@G%cO=2QEBjJgnjgHtZH<9xN9(R0n<+>kH 
zTeOpk{BK%`Tm6Z}l@A%!w7a zWiG^{WmF?MVyrpYft~=|1}iU5)g&-kqE5npvCCqv?sl$Qa*mviP89KzTRLPHx@0Lk zTK@oN+sI1!*AUD5u(%)VAj2n4axt8fQwej=QY9r~`n9x%+F4DwTxLlp(qWba&++Cq z+~*?$a1F^LHP34Dh^Hpqt)=?_S>lr3J*zMbRz^AMPDXp+bI7f^TK%9B3u_HXxiZOr zZ5g%T*q^66yV7VvgJYiN9$gfVQ*(^b>*8#U?C`JHQcE+f=L{bA0k+!0x;47N)>K^hUj~7k%LYQ zbynZk_5NiWp(@^ek5W{7k1ubSvEgqb?IzMb_~^seZ(~_HzlKaUAKDsApf;_!VBM1$ zJe6PXM!<#{>6~-Tc9PoKYx;B&Ne`7cC4_;n@klst%bq%7s9Us_vN)9n`1d5poz!yp z^u~O}^;cooQ*qZ_jN>P3bw?eec)|2ZV3JtgZ9YM^GEQ17edt^J$}`4!0~qwLNm=6Z z(&pMIT z?j`W#?1Dy8mWmx|Jo|Vb# z+N7FozwX*Jo^1TmWrqZAz#N?ZW0Ql7S3k6uHKNV^oLp}tOkG(p&Wxl1^J5f-T&5xn z%;)ACNDWYF*GB%qZsJu@%CZMmF*k5{1bXKb`4JmxyskTuxRw5#SE&gpE2GSuTwR(R zvlz+p*?M5GVtGGKl)Idk+%xNzC0FVyDAFC{Vhn-zf}9`m{{ZV#nC-!4`NvKU27CVi zpVF%2jyu8Z8HPYHoR7wu(?}b1X}Qnd&&p5J+L4MU&I>UkI3N9bYRH>O-ys>n1yANb z`qUr=ERmHVLw^o_W1ntmNXa0s{QSfWbM2Z{l@7?+C2W1?69wLT(VI~V-I7uKl%-G%Mr-!D73rj`a7B`cUmzLLedm0rbRsL9X_YELw|W|rdhSs^x_L( za9NjmX8z)g{LSs#zAGzT@kO_Xw?}ailTz5ES^2-}^T;wf{oh@q4ePh9ayA-PtK)0m zw`x}{9kBUOs(%iC>`BlszfY;BEKE|mc3s(VqW6$$8qTfbE7PXwkvvv81jHMHKff~$ zKMsSiC$CeXcy7Z(DIhO#d;C`{oG+@P_WJd$T{g@^G;hxbMYsWs=~r&7(fk~NJr9gPiAcl#`MR@WC#G>*$0Hjw$UzdN69?ydKEJP-H? ztW9I>_IB@OdACt%Vcczz3tW4*n4m^B@tkfxqZMNBQ`au-E?MrEO|+E?$glt=k$oCg z$NloSA6~W3T!xkaEuy@=w}`BA+(99D<8DshyUSo_uWIw@I7e7&`oHUP+wNyh(MhM4 z`%CWmaTUU|$fx_t;O*!;f!7$UtA@EU`4ZN8Qpwe|W(;)jT_M0BZaIwsc65=7a zZ$dwb2*}!fcp|l~${Ry9v@Fqx^C)D;nHSwBXgS=Wj!$*#(zu%##og=*QQeD1yCVRW z=l~tEcopbcPL~Cm+Q+Ef-Pu7rCS;kde8r4{%aS-CVO>~r{{VcP*FGwas%wA1nJesh zt;{jRS}?=Th1nQ}EF>A>g*mj_rPD?Jz#t?Hh8e-g2OgOOeMNNIr-$^dJ5sc0b zEsh}cq^nwlQAK+y+mg_3K3O9lFaH2rr{PsGU9oppDHr!2?Ab|Qn7S@-dSLo-{3EIS zJAo1f(_Hy;3&AL3CVoYWZ%_A&bf>$j*v8veBH)mvP7lg4oP52$r|DL-trq)R6Wu)V z+t@_K?ys&+*pZ0O)tL501&^nCRvoVFRvGFv;K_)<#Dz4t9Xl6yY0vWGl z09@Sya8P6YRAm1E0-T@XC#8GG!g;JTpAB1R`HRl;WglonAYDig?-+WjoJJh|;=ERJ zO8WIJM&4}l4OhYXt7`Ji;%!bljZvgQEv#1pNz|~&3>aXJfCLOZ1#G?DwvbxgCW&=v zaRjdvtWCH$Qxr_Rd58FzbjelkUftra3BlqUb376yp06BD9kuXGykWYI#|PMgGH`h{ zcfJiWXW~`^~-_?U{w6Ta(-T_gmq`F=&om3C$_zi!!MQ}RFX0Z+NS|+U?6R7M$*2x zA9YP)>34o~I%7p+9j2o^TQprcPDPv~w#oVaG6_4o{L&xc#fyNKkin*D%d9da3AxLXJziB*t%(AwIk zA20WCIuGJw>T051++X;XS*4EWO7PBwY!RRxt%3O?^l2q;;`JrJhPzE!29*SU9Q+|0k~Z9Amp>#+o~jA&+~$%^I>GJ5jpT7$>Dp_p#nW_Ko4$6S{{UoQkM~v2@mFk; zlvB05zhCk*r#UCfEqnfp{QG=_v(`VfX`(b;i`G0oO$PiIK*tLR;V~k(~6|gnM@Wq7Xt{Qti2n{;kP_%{#ipP`n zFDK$vkK{E%(`0D#k1TsJZ6 zy4gila<-QL05=(vk_VpUbN?bPsRTL zz$$ox>U$_O%{Or`1){v8{+dhs#{7c;_*Ng=t}~W69;X=nD*m;p#i;8ymxSZZm6<_P z_jAYTip?{`4-Tch^InYwXe;&C?>y{F{o_lfpLT=SQXuWvj(xHz!u_BY)$u|JoZ$^O;(mk;j)tMM_BD+ZI3j7 zZ47&VwBPKC=db)#Jk9n^86`sBDW4y`fA>K9gZ1~UDp!m&-J8L|SUW)#0N5J-%R|NyRTwzpgP@%A(3gk4e4}_6syHuIF`CEqE z`}+R?p0(TE!Uu+J2|%%gm<9AD+PpmCGjwA*kL@i!^V!PhOWU|oTP^#ZOF7J}dh<}4 znrP zzua%Bz@u%vog9xrfN8(D2a)|MHV_}YP<=>0(!GPkF*T11MLy~59V4F^Fp;%e>Df=| zSXzFoccaeGN{rE|`QVd+zfVu)U8(yz{n>!}W~pjgZLXyk?}!o(5>fYu_>XGmsY709 zXwIs2)wwFGqRpMwAepG+(G(x@KEc9ZjhG}y_-5WdZSE}yz;)T1A0PcfsdF@g(lk+f1QT*WH{mkN#x8^G3>=YeRBN zf~81J*#P{+(}dfZRpV6M$3t1`tNF4+JP4*aBP!qkcH@xWsUMYT8};8GnGo9Q7!k+e zQ4CtNR9{xV$UT;>=amY0_o&4_xzAje{#qwYVxjJ8#uI2KsHU>VxMVQTRt=o~9jbbg z$d?`YsrPxPSfdwWs-!O*4xe6}QzEyLX$rsaj=)Jezhbd&)muqpe~9dAEkHsRBW}#mGVb~DxmfkiKRSgd^Jur~7wC#ga)fbR)Lue(0?jV% zZX`*exmB7WxIu|kfnYlp0CQX_DchVJbgs$enLH7rG;_3DrMZkNMY%|38EoL~z$c$` zikjU$N?OKW*Ha%!iZgctPCg(2qTPl2c>lv8kNqJ{&kkEC7sC`c_aD6y@#C_ zp&iPSFhQP;Ek>x_eji7AsG4gjAW?o+;*xO#E{Lc*zCJo zn+v;Evs-|m7UZ0^KZr8}j@3SqrucdiZ-2FJ?Bz(Ee5;7WkxbYsWAhX46V5jeO3c2p zjtw_Zv4N%!r@Gxo<|ftyagDhL4aUaE!1k_{Q8!}a9i59Beb<$!>6eqQ%Qh_T!9697 zhST-oPvKsU_NyfNjh~&hrC&NJbHs|rIBa(hgX#}Xwda-wE4@_})qJbd4g6q$N0^K} 
z!T62!-$`&|m*tV$%5v7lPu`CoGV$uUBmL4U<0PeEs;v1e^i(Uh9xEsU^F#iCi zsq&IXuc-FVda-o`beo0M#zu@Br42a-+`NXm+zta9-gX5tleKj zu-4#KxD9VB9gQ#?+fVDC;*GfLj>LDXMocStz`JB6!}*6kVEdCP>`xUVJ;C#uNJC2+ zd3R9-z>yC|$A08~D@OZ`uE#a4L#f)_$9x@N)({9~okz)Ve}#^FK0EYqLB`>Z@51-- z+-ND{El$cv0=B_xA~N~+1EWUS&H))ePim)Yr^l?rJ&o++>_!hW?JT90qUpv5%=yCO zAf5>5X?0uCCzi0=i+d|^vf9}X5l5Wk-H&tY^sXw+LGr}xrFkpJTg_5Oo?S-bS9w{B zIs3$V;C(uMJ*y5~QLX$>X?ym2dzQ737eyuFW(VbQ+wQXCxH->ise7zOFAv#dnOURV zYbQw;zXJYKTQB->WUR&2EVA{@$9d zzc!_4u-o0-rg?0o!UZ3Hr_KoKHsg=2SP@5Y7nyN4k(UWG;fRfnal8&dY<=YzJoGim z=~r!U3VD$-!Z2gFPrnnMRkP@M`i!Zq`&~#Ic`f|7-r<#$NEaz_pW!39<0Cyl2dd(# zDsXp1%|c4g`u@MJ#9PZqcYK5l>;&iOk4#c+b{hfNjt)+9{Rit;ZCY5q&ut{)Ya$9l z%&Hm=OJVrj03`A|9F5&7DBUjYp6=G(;6|>#bWYozGE^U%7%aKnw1Lhmx}9|`C85Jo zgO00IS28J-#=twTzIgS`IwG^&Bw3C%+!VB*AdjA;{{T*>wMaai9LC#yTdDj%#w7h2yV#mE`Q@t6-SQ)|OR4-Y0*Z`;y7iMw@SnN#lL_aP84(m=9>&LY1Y!qBQ?d% z&K22C-NXU%qK~>TJ^1U%t`{1(hU&^*>ifMHQe4ttbt4Q@+pL%G4(*X#UBt|BpOma& z4=1h(xFD7QP7jhsTYw)nd2;(RCN94x2 zhq|}EyFl#{s`n5iN~0gco}XQ;YfLvZ8Op*Lgj3~wq2dKtQD^KlWISV5TyQaVw4w>3>{{Sr1 z{{ZMs_yJJdyW`0xY7R+_$udd1CCh#vn9pHc_?q&(X|3N+CH-|G5KG?dP}J>hbSTBX zq{7i=`His3$v-_=d69|#0BiEAIx^}yq>>2i{>n;z=~@=JoR3|(2^jZmfsA5{n{6{g zwA}9}+vHO5#<3T03vyVKxw3iVXeaoMGe!Q=g{AZMyt6Rf7DE6Ac8AFbFb_?nXD2l( zJnHdx^#1@q=@goav;MwAHLr&jmzK9$o#vw7Xp;eEGOI?1px-A7-|nDe>5B6`LrT@W zDX1jtBcRS8zJqe4gB^(g{sd;dYf`y$DU$MUK2SCw5`3fCP6==L0=MB@?x%Bni98ni zd{fCZTUsfRc5UoR<&XXL;Qs)!D?8iWPn!KV{7n_40D1O|bQR_IUIx}}w6lEiL${p5 zALhq%&j;z6&eL^?EgZ)?+`Y5$B1bDJBk7zs9<`NBGIvzEbpEy~E)P?<)~)TVq?Q!a ztpp{Rw#yhP7(GA(3Ai8bDeLWAZNkLB$+gt)U?of#q&~*CAzOBYUug5&Mq}KbbdiW$ zf4mDYVUF0(rBlAPNW}LWQ1DoooA+$qsy6ZT&0!8mJ6~VaS8t)uO}ZJ09HVkG-nVSE z`?zgkv|UE-D>$6IY3A`89)44fI&*>VShGcxIYv%$0}g}os>;E4D~^5ZvB6Z0nC^Bq z_tu8aC0!CdT6EtdvVyLW1CT_KI~mf;X6gHN)O<%i9TJ2!ETrMVp{4NFAO?(Ian zcBx`+6nw@Mc#!`9zZ{=5XZ@wzzPRgIu=sOP)Gm#^x-7EByQN_u@`PB(l*#u(bsLET zk6tS3>Uw8|!2Zc()HQ`X>QYDhm=CSGFy*uV04*GUx}&XY7`tf7O8qn>T@J@Zu(rAJ zKKR^kk{v232)LLW5M(@vWB2>**kcBZD<2U2M7w)!J{x%LtRgO!vA2_NY=q+qJy`eU zo4KsLJ3-d`QK`zd@yT^?e|vE>u6D*xR*~DG>+XAIy=P6+H17myZ#A0Eki20pHD*_N zrUYaxcH@KxlBN0qo^wS?iVM3g5$1MUF8=`6_1uF<_;07^)-o@NVX>a&m39bjS!0b< zf=e*NJGuFGsQbJOXEkS8*5%eFwebFtYb1YYhs|LHuaoA+tC+~_iZ+0};YTFY8a9wM z*L&M96M!NcqnWH^2HX_L{mTp-sB93&?(WIyTDm`m=G%0-oLZF7T@T;Sblz3Qt0wFS z%7R`v2V8JlBPSeJ9Yp1<*|aSkAiif~;C~rKszv>ob9o)r&9=yX+Kdc$4nZYB?!yP4 z;sbUDYhT3v6R`0daa=qW{{Y%nxob;_W8Ncf+n4y4JmByTHNon7UBBD3_0~L-G#@RU zlqOBEji{&&8|5J2@#*)uuE)bZDO;PB(REvhS65XmpH3)NPPMq-p8%MG~%YHV^!>O#Gno&yso^+@REx9Vc|x_5F8$BT;WItu%oS ztC_Xi*x25)j$w{MPnChu*bo>Gx&o)DtxY1z*TMFaUPPbi{J^oh?pPf1Vqf<`4F3C2 ze-&hSdr!R6EvzMO(R`CMxLiE_Y!CRT5UVF{{RnPyhb$5ygGz}M-oM&+9yrl zF^i}f{{WtnxnrOAkMPt@`$fvPPw@W$hSL@ zyX9#}qkJUFN#!ruY81iw6+L<2PM^Ca?u*Qd)IMn)+181(Vxol^oWi2ryIUqWgt5# z_2Gl=Q$sR@(7x{2w3h*UZ#!Nl%-qLc4$TH*d*(s(+<7QoPgjG~IuhZg32CF`ce`DmdE{?n+jV#jxtoRWQ5 z)-JTFK3acy(x}s-xvOgXf1maKM0Xb&w~Fo&=4Fb~EUZY2wirR`5r3KLdz{xnrrKI* zF>QnaC-F#mcKtZ|RKX(y0xhaK05AvZL5+I@^etM_qZn({=dE6KciXXR?P5dIaQ9JH zwaL;y59#qC#_N6L4&1SR<{(@^bR+fb2HJf%FKyKOo6IV~-Mm4{O zFGhAAL%8`PC)^zW04h%+PfUNn^Ga?#Qy$}rPqdz&`Vg;MogO2Ap%r?PPqTlGNCaQS z)4v%0Mx*;r7}`G$f1Lr;kGQ}0&M4-%xko&9a>miIk&uiKFu;$dE0)#t5p9|5i%ADk zj6P-m01;hQA$9qjFV%moF4zIUW>`hVY-xVfC zHPIWzWm$~DgLIWJ%ERm5=-4%%s`!51=QCNl#WBfbZWX`4))l?o_KR}ykC2`kHyf3| z OrA^uFc5=e9ymva+m6&c;*ykDk5cuiOW9WaKQlHPZxEhQ$%_|JFshzwx-uXvk z`W*XI7CM!!@NjLVfzb?((0s@gbtQWKRW$cyVDoL5N~4h?Ht+tpKj)g}#nrl8yZ-=R zh^fxC7ofAZ9#(SIxg?LK7qP@Yn_^Oy;#hQ zW1c^|fye&PaF!w9gY`a*T1)k^cbKuHwm~Zvfk~TFr1{ zxVVh~K*0e-Ip?oFnXWG48C9c_Y=@kAps~-&dK~@+yL(|2UKWi}1f_2z+9Qa_p!sSy 
z4Xn8y{O8j(7@IW|%8$b@>#88Ov%BA97fZB~8O8*$LSm4v4+NtOgN}RgRPF6!n&w2X z&~+F!GQTWPTp0@B@^+9P9^ty;uSs&&HZvBCg0X;FOKDYicdxjhWSb1!D_u6TY>UI|mZ)vJS6v(B5Tx@ux zO~-cT+qiIgZBvjNMvveCzzg&Zky+4y2IBBg8Bl8d1A!Zz-gU&rU!TJ+kh8}996p_s}X45R)w#k0d z@UM}y#c;l{p6C72eXoodq2B#>Fz4Y0=Vl0De>Zi7950DIj@t)sZNw4As4wK*;ibNKdRKZdyBOPk)2 z(<$=WF)(f|CWp(pnF-pW*rtBzO#GyFIOpk7L;JJ5bAz61V^P!%`VCUW1P`(#xQ`rf zUzKHE*vjLMoQle3j!`6v3_)8f8?WA6^NexDU92?jXjn5urdcMD6-SpWYBv>A{6KVN z&r$e_)Uso53+*sTDiGN6^X5i*`m*zo!wiQbp%uzTT{2g4mOEN4*b3dspF$3Q$*zLg z=D3PGc=kMlGQ_BHxqk86>CwJknW}br6Sk(6ytdPsLPiNrw(Tj5$h{8J*pKifahh3W z4=fgzn+?DwIGNO}tbl?dBydPM!T$gORc5%gYjJj6AoGw0k#n|Acq}vWzH{>SJajdW zajpHX2ePwihT^_v8xp}XG1)NB{Ilthe?Ei`*3?Nm-(zCq#PO((8+({prQW}9bXStr zPUgPe}_gQoZb?)LIcM_svV)R2i3K{Gqt=c6-`lwWq_;NzuP(d^>1x?7}J zKHVdE;x}B!$mP?Xy?FzVYUrcANSn_{rHb8=Rh}iuh4Mkj9gjH}#|Mh$#Cu9{ZUv_a zXvsyKT0A_*7oNzdqBYwbDs%Fp{{VCjdF78SoYpgWmk~ThH;JxKopa6zi==UnMqs~! zhCY>}eSB{(B6->$A~o}T_+luoNW!F;IQPnqJv|m{7KY6&rMvc%J$h%J$}HTiig$EvP4 zRb1maeTcZZmg4r|9d6ZQwq1|7r~LDYewoQWs!e*6BOagU=KlZ>yv8?Lt;)BS zmRgLNzj*>9i^?Ujia(*w>*(3K4P!Sv3v4&Wr6+{=bF8x+DewQPY06>p;|h2I@Sw`yrd|qu7K@gssP`|A6)(dyr#PJjfI7~rMj)TMcpxJnF+@~ zGqR^1*{<50>8@8|=WAJ>k*C03UE~J-6moed9PQ)#z5f7Oi&A)|xK)hEs~FnN8*MWC zWB6C{C%HW*8`XkQI1jM&|doP5sM z{uSrkRU3HjZZ4zpZT{6Fjmji|NFxWQ?zae|9`ZSFRjp~SqMGt)C$LwI7YT0|F+v*v zbYGaS9R>?ylUb9=Jc+tzl%#4-)xcHu3OHKprwWxL%=BNU_5T14WZYu=o3Z$JPt$EC z*Kb1VSK#0YZiAO${40Q{*#7EeLGQ(3k*@1kBwND{oPcjCmp)uk`m;F8x74s6)XCau zE*-v7yewxBVY~(TmS$H2LOB1H4Q=^4@9Ya3*E|2En{)#Cm+PTmG9Jd zu6R?7S2}imkx!kwBg`%>bqx;hWbw&wAszZK?bKpIHAznRDImN>{AF^qn-%v@Pb1{tB97a2ZeWpVlSt)V4Rd&^TuyXf~MoV+ix zN91xqeD&+zsyG|YFhTXD4y>R$jQ;?8IM3FsM`-h+MFd7!nUzAIW^9v%PypnyJYzj8 zMIKR-OLY|DDHus5#y)+@s~;b9DvsFepHo`eZ---(PhD47x*C3x%t}7Z(%li&U_*`G zg^my&r#w}=T`teVvqqQJ>#FOrFi5nRfq3C#z!R0_-#_xnf8a-|)$`!*5$m_jsCcIE zY0&}^q`4ua+Bzykzja3debp5gj28Ark*410=k+~$O-2;a^s7^6rtThHoNQg@e$OLy zmHz-i{2*6hpnNxq$(rlLs3ivi-pV_GttNVz3Y@M78yDs+*B$os;%)8H4+%vKc2?>? z(yfA-=3~ZQXH`{a^aP%8II7yuimWt{_6w=1YR;P{y3%Z;m&qOC4p~b!;7MWEHH@mt zUhFhCRMztU0Dx#gtZSCj!Qt&2!y?KCB1U(G;EEXYmSW6C=E3TD$sbDQ^-Wh$*0scj z&&Kvv@rjH!R@QBD4jHhjN+8_FjFZ=zi&E4yuMWp|;gw76V&;30m-4;C#J*w^h~wM1 z<1BO6C4H;6@P~yV(oru3gnB*0#!bWAIi6V4l2$fj`>&90W?_-I05~-oL31_zeE$H* zZMW{H{d~X7-m;E80WB@1YyEFjalHQk6rmAFchCH@QI=I+SeDujTBN+T`$g4@YawLS z{O7r}w28@z*eYiQK>@aa21x^R^dNH;&bNPhEp+VwN`iQ|4MO!F%8ESgR$uw(&7Y7C zNWtz+SG0=G374NQODlS>;YIi^e<)oe_t=Wr4{}D zopO(o4Bpk+TFy));_1$k)6)n003r|Z9hz&S?x60?-uGb*>kJLLc9f}^EcqP z$}+taZXkLSo6^AW%W?J>I+utc)8a?CmQ`5QJnRcAe zO`yqZ;nj)eyuFoKPehrAM<@Py#xwHv<(sucrm0EIYuJ?sJwLDMjE7Wk@@w~Sm@VNe z4xew>$$_wQ{4tC%W7Ue)duO;BWVe=3Pc`-IrpD0j85`SU%vh0tTlY%ba?eoQD(g3} z+9lYFNw&3?R=8&fvpgJsv%y<*YdMJ(FGd!(zc{XB2i?9jR=pNI~+! 
z{{XIQmnZwor@cazN{?4>Bx}hS^w;(O0Dx}XE}Yt|ej8bg);c0Imv?}L-yRs;v!@}8 zW;oR3ka6o(BJm}~&AzGPiDiuzCMjm2u^76TZA=0_Qygl?)8$j%xvRz+KZ8~P?r(L_ zw=x$WI@%%q(jRwxw_bLqt!Qd`eD`V`_!94-s7}jlX-yMlrw$AKsL+9>fa2E8A-RIE%yEl1&dxRkoGc zK^{%I@0w13yutbZ0BhcvZ>q&{uj?KswYS@0qFZ@7XzBnsLC3xW^{FD{xy85twrVicYoKBqic6Ol6b4fGq`Cbw`grf zPGg$nlmH+0A`kjiPqf=;zYnyFYo(c=R=Se;%xnXsQU2;KqXc9~aZkHhwEqAMcymOz zQg3wv+LV!zoy{>3K)?6E-Twdrt9~TVWujY5MXjSlr6B{Pas09{{6vkOpGwhh zX>2DgFa2x(006hGYEPo+J|D7S<|wl_b2Ol?_nhwl^%)U_9eArgCze||bnP?AkjH6d z97L%N`@%fQ0UdG+{*{YmtIy(3it@n|ul8;A_s<)B)wYlLhoKQCU&E1F2T{>=NUr=_ zs>vh>*REDLTN2(l`L-y@z)(PEBLQ2Eaal$Zr!Hi#f6MyvB5;$uSEuXx0$W=PIkijK zO^%kY`^mz7@<#)RqDK4A>(DP#>0ExPt={;R(@(L0T4}Kz>1>aYo}1-=<9v5NfCGwk z_MxeG*Hl(@@~t3WE*KzR-P@;RMecd}lgX`!ptP`$$+HXPFD3f9KkfP-!n zm|$_YA45_bgZ;n{u&B+!ZVR8IQJ`6&z>J0SN|Wk1qUz4hc)Y8IOM}>_%eUy&ADPJI zPy4M$oV!S9@=x&*x~54oqb9XEjYmnA#f;HzdvJbPILi8G{At#joGInATgH>F?b9yr z-UI#R&IUbHeREwaPU?W67|(Nq_#A&a<+U3{M`;mEMGKX^^0q@207JMNfT*Zju`Y`;S4a? z{{U<7tz+0A*D<4`fqkP`2akK{+MIirI3_7rWP4$k^{(0NMZNn@o8xIiZbQKjyU-rJ z!TdXlZ%DWBSpYoociad%y~7LXJtLkuRYp2sz!Es+c$jQ?5}b^A>N@k(7|l) z6v@6d%SO2_{EYM;FJHWP9_FPwjDup>S%^%CGND>k^Lh@QI`ryJDO0j-%c-SA?e36T z>DO9)92PZu5Z1O?qPxPy|_g4{H+=#8Ef-xv$^Zs3vedh?iz4;jD9qTht zy4JKSzaw2&f@ZhcEx}YSu0bKn9*hA5AEiLI32_v1#*w^aZSw;B<#UnGRblr}&{Qgu zO{Jm)qa>c}*Rs@ud2_rnK1)Y%o?ddSxhK#DKf~&$rcFd;Mcouml10yy?&GHHsr)`* zLG?AAcQwGcYi}`@-r^w{aoBsD5jO;Y3u+B=(hnPydZ${<{A!7GiY zg*o8oIjh3yY+*L`sdWm%%*z~;hLu|wD0029ez;TbRiTC2>V2TfzC-0>Ve@}@I`;>@ zI%juk&bgVIM2g~h%yz+JGhlg<0De~^qYimK$KIi?W*Q}-vlZ=)#k_A6vc~d1&W`1S zFz?9+JpuM6wQXmPSj^F+(cGvnEOGqERQ#lldf}Vq>-pCoBw)bnDsS@{yvK-x$l+w$ z)muG&g09=VkV2A0{*I)<~lBZWST&XP=g6 zj$$jFidUx3n0&pnQna@=G9n{R9barW5#e^_x-j+0z|R%VHl)lJdwC{Rm~NG13R63Y zBL~p8L)++T-JYtc1eWqg43WrzytvpiFC6v3A3<5uP>QtUV6)yV(V7kSKwm$Y`7XCW zEQ(3SHw-8aIRS<IawY5$}l1 z<}-urI-Z!#Iz6Db_JYahJbUftE9IU40Nx$K5nO=$vsVT^J};FcZ%|Ex3Lk%VbkKlF1xo!ymw;3TOBeJ zj0)(cwPAF&`m7Lu*x1+T@vPrB-Ex>X&I!*4Z+ulYvW87YTd9&ed)W@{uBDPOkKsI% z$M=R*3}U(MMand-Wfl4F_me79)=}2sXa4{aeLCkTaeEAq2=8qnK@4`* z_e{;VaZs{t`=R%4bLo-QjXPhmOKk@7SGx1Br9#@3;f0ns$y2fv4UOK2ql^Y)&|{tt zmTOBXJN%Z*?%%u7Zp7fy^gNfgE2$SSGySGvCFIdUt8Waf8aPw{0e0z}c2U>gHOq1b zPTd7|mt#$W$~BFVpoB&XtYM7Pu=(4M`DTi7j_e0sYt+Qb)?LYwL~0Y?y4&1OJ<2X* zXx)m$0RI4IzhF8JdRIYhtKC{hCETyQqgdlHy8vW%`=O3LRZt5N>T8!rxweu!`C}0! 
z!nBbzpYqW`+sFGmxX=40p?iS#(ZX4>&eO_UlFZ!+A58xM-&Kt0`%NChjJZ+i5$X$d z0?TWI88`<7w{gZtP7lqDe=1{%CHXO(%onE}dFQG8g>af~qdlYU3{(yaV?M!O{gKJr z-n(rtQI7HkmNBtXH*>q?10Tiy?<4dbab2F>s%{eBqcW8>erS)E&P8`vOKl%4Z}OMt z3NU%I>rnYw$_P0=3EJ7L7Mw7*4s>ht+ zLgn(#~)FMXjmwxQ5aqA7(ST ziDf4|9Jg#@xp5T|x{v4mKfux{$!~w_`s!j{T0OO_s>uHUyNsEkl0}8vpkLkNcVD6U zS2ZKRF7Z9Rsy;AP*{3V-!8t#Tb?f1|uVjuJ%h{w3#UQt`K=R7t8BD0!7x;-e?M;(I z@UMp=OS`E>;AtH&uk)%1j2OMbDNn6_Ei!w`pPZs*Riypj>+=5qGj#~VTi^QryBjtz>H0{L4-aU!t8}OqYxXMFFm7D8 z%t2i3Y!r<_>`hP$uC1guI(3|a;5OkU^wF6kQ~Ye8jf3Vm1ab3L47!e%mv;J;(&`ex zertO#Id>smH*e%0o00PJL9MGjGfjJ=r&#D3uB)hBMjqm8>w|70i{)%?QGt(_a})j* zZlBtEjmx6b`uW_Q?e+avRUlCqQrD+EDSaU7K-QJEF*3>pjD!Z24%udm?CsT|H(fcgzWFJ#& z68ADFi%;{FH1Q_x5VtKnihgFmA&P;M-o2&OgccFQrNeBRO{2$h_YV`tYj3%CVPna8 zW_;vlB||Tu%C)j-?z*nKbk8u0Y!gXpF#<3ETiaV6LzNgT>yS3!a%%}YyTAB9*Yu4d zx-Y~0{{XK--L1{er)PbrTHnoKrMzXAQMpes=H-J;hB$^LP{9DkPiDs{D-BNCPYZZn z>+BYB80FQq1SBwtmvKAs^GM71_22Z00q$jZVwPLGvn(zDg2HH}RD@?rWGbPxj^UMzs1 z)?@P90K@l_Wbnh60k{AR{M9AyyQM*WsOomo-s;ynd05j>F_RiQDNqJEc@NC}HjMB% z29tc=Z>d!6y3ut0zZ-A)o`vC07Qv`n>6%WZdkbr_Y4F`TsN3dlT%Ha-Y+#l5t?h3@ zwDAU!abb33Rhl+WHY2suaKS3wzo_8xUoqcYe`88@%}amw#mcMMTgdFM4;U9R^N@R@ z^(W>$U>|t+apO6>Md9lqt0;?7aOe)*t2vC}vHsUWRC?rh$5kiNdw*Jg!8z(i&hh@d z&m!@MhV>r{HMfRr>;%*5@~4}16hb7s+l7uK#|yWIZ1rQFf~h{8VXf-cJ{q^ZN7A&I zO3N~XmfI$7W(TLtcwF`>J6F=0#+PfX>F;H97$ks&k~6jABoVn=_)ZBK_s4qqzs24e zyZC*j-04xXNhYHycE42Gt!)+=A|LJU%dr0dWpTw+<7c5Ul51b<{v4sE-kndzdY_6l z8*eVlNFQkXF&GL{rO%<2{|UnfbRq9^zm43Xs3> zGwD%lLh{o_()>J%((0OwiGO^*#?r_(+r)jso!Iq%YSg%}hf(o{li^5iai{4@ZSE#g zLbw~A?q8w9uh1<%pVr8C>+}BrFY_SQEuT*Co`lEq~1r*Jb!`=^ueZNUEkyH3#cMb*43z50A}d1OLYqN^5Ib&0DW6w{OYfQ?q-4=V_j!0E}vxnTNA;!or)`caAZ05s_AU{ zXTzJ1DknEO=AxFV7+;y8g9Ly&l4Nxl^;*}Mufxv<+u3S15nO9Jg~MK?G7!mcuxvvr zjvayCb{+#Be(osWZ$gviYTm!){dt5o66rn}*1TDI>X)-xn`>Cy1(l7@9&0GZ+^Wfs z?|t9^>*b|bMjR;!PTkn!KAEpU)+D*O@Me=0{lm$1s!uktc_hbkGUv*RC;PZoKEbn& zDuinb+qJ7Aq-9J>90C;MaE?#QD&z1u>$NqbW_2W>lvce=%?AAzRE8lM%Ed>MGwqGr zJQaW8Z}+(Ou7zIPP@UDHXs=6h3EIoojC=9Uaa@L_q&>5|mgy@%5nPAHa?-1+>jhRe?wKvuQ80Gn)gnt%Q58{Nh6KNmd?}u6nV${>-ka{+xx<=nG?#z zYiHPbUY@mY$EOb_PdgPqC8}O=CkH=|G`>G?^yx{d=C=ksb**0y{{YBPG)Pqt!)8AYEoop5g+C%sr*RJxA5817l<5lpQSLc zIpB}NoK`W{^2IcwoZ$XuyY)QH2H+036q)|(6HfB;>G*R&k7?+Bv;f|{LjDxH2V+fO z=e`H9sLtWnBkCx)Tg;Hfo}hh6r4Wy&(Bl;tcjE^?LsEIG9432-JeXIF?7dH-kK!~r z#Vtn~a+9(wwZ8QN+jD<=c;oRss{>Nh?CsB&Z4`6dFD7C>MLxr$eN97utlVm>EbOB4 zt^2Cr89&-{R%~?JYc|ZTlHBzi{onUN>-g6NM&)z9Dm0e5vmfl2H%IMP49(^f^A^h! 
z{(DtNjo_2h9Sv<=#c&zEMH&7B-`pCa(v0nq8GV|$l}TK#cYB%Qm$;-9S6F{{U$9Jt{e@$urj=)QlHs#!2r?I1R--95yP%3jYAY<8pdur_!VdKKkL) zoac|un;pB&C`1`0O9AVO7tDoLRV)>OUEPnUq}qB9b4tz`3Q0Y=RhkjzVVH6UTxX7* zk6zWe46ms8XT%o-hKz`$RL?Q8xH!*K^0DXIx!7)4C^4wq7&9RN?#J``S3{z~Z~d=m zBY=xNOMS6pAZ!^EX+wZ9hT6bmy>rxsuO_xC3tM+9NiEi&a|f9wrPVdsks?V7iUQ z+jn~#t7iSwbB)_nADIE}7uV5;0;Z3{9v`^BPxwk@ytTZ#g&rt$rBaKW;GC)Xn}e}< z8@l=>E=xIRTq5T$bmZedXt2<4mv-xWg|m}s=Y|ZQp=#?^NiSn-td@8}81^5Rx%uEtWak6gwTx5L) z1CkF4es1J|0QRD#PKc<-t1=902{!>YqkxBw2R;7)PHV65?}=`FJ8N=~+QlMF<9naH zant7PL9Td43}x^)=Mti}a(z|1iga?hMRiiD$7`$cwTG|x*CelVbnJ~k9cof|pTnAJ z;wQefmPU41LYWh29JYRD&UoXcdDfk%wS=Y1A0@eT13SGx{dL$y_JYzvM;&6AuN`_f zqXXOWsu!Miin3}NWvVUbm`mL--0=SZwGy5e*MBTC#@gOGt23Hv(&j~(72s&rc%5V< zua@XlzIDp0e{_#u_Nu!*)y#6rRb#eT)r5%<{{TfLg8bvJ-u^D9KJ}g9>&c|Ln%;RC z+G1o~$0WZ~&%J89wClbMn20gR0Qs_c?TKy={FX4B-5NjkFaS++64jPcWV zOdc~;%%r{IHX8CzJdycGmm6Pe#=tKtoUqCC$od+PTHDOR-e{y)vab_F+n5eRE>C~c zp{&U6?ouDKNjhAi%fS?q7@gap+`Ti`pK8~#)nT`Z8E+@Fm0jWT{oC1a8+5$0GpD(Jz|F6#oEpg(RGP>fN%xv$&2=wNEJ6HnfTyA7BD1e!lgcjxBwL6{BauNPzv85F2sGPyUKL5MZI|ZB$^aydm*vPk*VyCR z-z7Og^F?XfsM_fN08?Tct8I&IG;+=*$j6!&e;7^N9FdcP58>XaNMttu0AxtK$ug@P z;g0lW$tt+uH&)5{w&&WW7Yr0zx?&j^NY2p!(gV{xXM^}0(^@;hGFjf-#ugMPt2E1q zgAtH0N~34FJbi1aD7&<+e(ark{zOnkKA&?1yULc5TEN9#LZ~q(&Ceg*&&$bT+z(1I zPWqjl)$P^ni)4^VByh`?j!6)W+m*AN@B;q48qix=FSgpH^a@p zw?Mf#6>j2v4%{-qr)0q+h~@v2L>}FvEfm(pSinkNIm}m3e&pN>oYypq;t4sz%0x_Sk*!0df@VURqKDpC`S_#qZm4lX;N3@14=efZwcprPB(N3o_(_}|1 zZ>nF38`|k{VtdP(ri2zNZo4Bv$I9O+Y!1CLN$Kla+I-fwx<#&1;?mY;^L)pZaLv30 zkmXAdHju0ANX=2dipNCJO4-E)?ZZ4^s;d!dqy`M#3%iqy<7gc^ii*kZ_1WP|h~~Mw zw#XL1794P)SGLmNdQz9Y3-l={!R@KJZFTnBwTjBiZs1YXrET+WaJzCbwMgTE(>bk3 zb=#|dv|6Eq%$gr9=8G&=H6$okY?WNGz+8Vi#`|2FKC32~Yhot7iWGs%rs-v#KtYN# zjDd#Rk^RzZQ%-w0FYg)&h0dUkVMRvXkhmRJ1LnZq19aV%>?o)2KYRWhZ%>Bb<^KQ# z(7CwOWs=r=o2!ZEkTZVtZo;VGxfnc*@t&CVqRQ^uMu{fZVh}+zg`;`m-{#5}qpcBdJ?M~XuNBaw0rM{sw8{)UO(xNRCOg?7XTkmp7CvRSz@u7Ko z@W9q~ut_u%NQKtHHi^}6r~>2Vi~*6Jr<3VbZWj?pEHK-N$W>yGB&c{JneIq*nWDiQGb)kFpH{&mu74`ZiLV`*C(|vZiNS9sNR^e}91ZQx z?tqX-Lsw?m30^lI_9~q#}vrd2@H&p-OAYl6ge&s9Fx;N#}yThn6gYKw(zyqt+w%&VGH-GC=(z>_&1^!>3 zsi6h-sjZ@3>Gt;k#!6V~`jh<5hb=Y2+l&K&gX6Ug0Z&Iz(p(-cI0NNOR3&!SJyWad75U8Xv#HBURg15Ha=&!iPQstz^)Sm zzj2xDG~}~GaI1S1`oZ(1ykuB!u9@2_Zwh3XR|;91(Dmcxn_GQ9uj~AR{k8nR;Qs(x ztw%uvKK(;mwu08e)fzi}IoQT5p>xdB5tcHz1Oyply!6S&hO>Ou@^?;fdhH>4 z@W_RmS<4QMsO{5cg@xv^A~ATP2Wu>3{{YKJlY8WzcV`$;N0@(K*ZF<^ZM=J%U20Uc z)Kcrjvck~YNh1c1+SyOfG@C}y%)8GS3NXEDEjWX!Noj3yAdbyh?=}0U`PrMaMS%;v zfS`=F0W_DZ9NK5unn<)8eL_Hx*xTE)UP%OCZFbKiAxX<&RaZDUs~Ti7>hoFo)}Lq4 ztgZ}}$mB?G9T(+bINW!uJClwV8NtP5+lx#3{=A8nzt{Em?3Tv0-k&x90EX{l)5LMz z%?jJeI07?p`qHyyMx0%9fC{dS(;g< zjXAoT7%Tz%u_ExK^ciD}@va_^No+SRRdv7l{-vJ+YPNTNJ!_qMMGJWps@E>-`={HS z{NRSjQn+rrtEj*6b?w9}tZJHK={9!EX47qA4IGzqKQX$V4#fv^KSPiQ-mD)FXgUS9 zwQZ(JYd?u~XKc$2n-Obl2VAhhn;d7Ndz$HWTN?|rbE|9e*xgKk#}=Jz!In7!g~PIt zqiTS5sUstk#cNtr?yj`{wkC{b?)v^eKHg`hd^+*x{3Uudy6OlZ)LC0&9un*(13Y8c zpO|(z#%rO~Y^=3=HnG&L5_lYf5=F?|4ntSW_Oo7VTI{;q_wA?Y`c#Ti?gR_uMx*40 z5IEh1`BZU&#klM3{{RoZBWeCPf=fxjx7DHqn2G-YTAiJLsO){Jr_Cj6{{UT?lZsb= z*U0lv5qNt40KncL)ci@Q+^z4<^668yH_LM%Bj#Lj<*r+>Bqlq7$Z1a|mEe6(PMch~wAC%{H0?s#JBvBS*&~o-5!mOk?hk74uOECn)vfh? 
z4@=YIEu&;u*~+UtYG5l|}Nx$g#Sb<)lV- zGEDwpKlIa<2lL4MYhot`J-`B`aL9Swo}BkypGw|>va>mH58k>b`^d8>ZOO!g9RC2d zpXXVcmVhMPHK0^}r8i+)j^3Zjw4)_%7!n<1Ooq>0zNfjt{VLK45+w|NQNrh*Ju9Ng z=+1Q~7j$t}x7Rvs%%Bm^&l8Lf{;{r>()ugd_d?hpe9@k(_B^gE8i)@BVxKodsL|pw?RDL{s>JxA?znpZd+FW_n zEdFd}bm{)^>sa10B;LumJxm6aDnCP#W7eBWc{G*OK=E4I2mF6b zcWuO9`!K9}+VJSj)c*kW%zx-rQ23FZUxAyRXFc2MDUT}#>G@TSiaugN{ArRQVYm#h zs68vH?BUR0mjj_b`J;F}*?zQv2hjWUpfa!C!K0c;IaAPhW*zqrdX_)lI7NuFvA_eI znuschTIOe1EtKHO{WIvJ_>ZvsYaZ)cn&7pbmmHDC{4)>rl6_BM>?&nm+8RR*%(1*_If&@4|9i; zs7}NDYjWStRF~%w#f`6!c^}>4v6`Hs-Sst9BOh{_)A14Xm~_iEc)&=Ra+1DBo9~Qd z{{SkWmxyH8A&zX1?~nlR)OPDq-$?#vnkAX{e-bX?$4`{@9@Rximw%M1&U&@8`Eq_L zde$m6RBYw=e@s@3cOO34V#<-_1Cg-q3O@tK`RQ5jz$>{~ea1ygDF#ih7DxX8ZG4%3 z!2Y!ik_g9_wC6b~Ds5r-`c^Zh)45hMl+ynI5Tq2Nt#q505q#)@EUV=&B>HyzO*U7z z@&>@)Xf2(}eqFwVV}|^3P!o;HScN>`ccz9r3Wei2&w7gk4ZGU~f}Z5mf>wAEA_vG$ z4}2cqN`0JV?xR5uo=N`zSI^Du#X<);=soG*TL`shD2nMNU^b4VjN{&{XUl}Im6I9C zSNH5jdF@o;``bqDc);&k7M8p9MYeMgCAR$Brhg8g)^bYs0tvLqqlz1dRLcaYU0DuD z+xLO=EBv{xvqIBv{5=Ben#_M>werN4sVQC8GBCmooR$9m;yNfj>pI%n@M=+M_8w@G z=HXx}2QkT!f-%#Nz>U@1{7dk;i%7C-3x<>Tj`AZ62V{&XDskuz7tS+d) zT-nEH*G3&k+svX%c~nOsf4JZ_6u9>U0^NuQ*0%ITmd96(14V9QStAqNwiC-kj4|uD zD+M?l;Yh9!^BPV3B${TLpj6Fd5R`#c@8~81!`$3ur*Y~PM*H;*@L|&jo#JUL|2Z=SEPZ5E*zilAkEb^{xARxb5tjp6TRG`#~#=kL%vG zn`vlVE!5GmSQ(-76tAwv>(3ub^$i2Xl3Lp>id!I$tH$y)nLfC3e@|NT2pEf5t|o*a zx0J^)kCt9O@c@0_df(FTZC=q;o(rg(4tsRzGi{L? z&RjGyM8E2eI1CO+{{VWfjit7na=e4n{{YoiZme3>ytRZw8zI7=z+S(19lF&oFLY?w z49LGMnZeFG;;)%*h3aMA>1hqrcUph$E(3h1KI1U#5Ae6>eX8tmgqBQ7U6~X}ae&Q^ z8`zu?_*Jx#lMbN9#|%`CjZA2z2=YX#vE@;{d*OXYLrJx{DmHo*V@ri}k{!UnHnDCt zu18bYgOARnwYLjz8nA34yOd+Foxd?$_iew*oOewOl18cW9kMbE4F3Rh{$Hgs*+wLd zNpi9hV(4&3P<^?qWSz7hQ*PP?x!rAU;&f#Naq^Zp-Gv8@$B&o3(~h-%!6Ke$+<9!v z2ITWF$EVGh&-%a@s2?^kMNiMjT3|L+2Y#e}S?A4ad#joK~DFN1rU4 zg_P2dFrtR?PF{VR>$h5C!=plgv|t zpYI0!N1?2}R>JnqI3}^4OPJO@tdgAWT<0Ey@<**~Tt?DJ?-Fm4GGu*;w`SJivD|d} zQEM{`shVhGZO$c?7+{Q!aHqHU)yJ7nd#h|iR@mj+T9;40F#Vn{vXCAnMA$-}LH9pj zO6j4#u={*3Y+#1$XBL6Zb`C%++&1CJA5OHheTGJfu5Dtu@|$oIF7V)F4B++0xanBe z*32)+pH8y6vRMMGw&=(aM*t9{uwH$8*0NEbK)-XUv%0p7#SE<|v~s`^fZ95GHmTe( z(X+>1#PR6HbUJK^wy}A!MKebdMhTsc09JhGIVv{er*3J2c|ODh7LlSEH+ix`L1G6g z820ts>&-3uUoDh2w|6nXq8*Vmt+qHpQb9X&j=X>}a1SKZRgRWje9D9VtaF#v7t(l^ zYu$5O5eUZnm1h%7fmAn@<*&*}&d@Ryd*cUHR4`?U+q8!4aCbLc z`&3iI<%puTCP`IGO|<>hED=>t_qk^+oM8J_wzF48ZeQ0?x=aEGxV+S+aWe*KW%EYm zUqw0P*p58_>7J9tufryVb8l^H4wG!s#w3>9o%3UyHc2G>p!LR2L&h@ge$5@TKv>)^ z=8{1l@$rQR*b)B#j~bHVD{EG;ZB2GTv`h9#CS0-;&PwOYC*Pb?_G(gh2)7--ubHZv zjo`SnYk6iCFsNc&^S_)f@8JYv@Fug|$$sRE?ugHGIkB&zZ4F7{_9NzzS-b zg1`0iDb1}}x8a>G+fTK;ymxuuf086ag+>9G{G7A5%06z254f(WBeTAaXkgSOjveM! zhT(#%;cy65$!6zh!0dVfToPZsrR+#c7r4vFBW#XX?ilWJ4Rx_f62>UnNX@&+y{%!$ zcE)h*NWkOnsm}zSD>j^++trCkv2mzekf-fX2EZ6)RV~MSXP`ZYYRkBFXroDt^0dI? 
zC|`2(^JKS9Ju%v>OEHm5@j(i+BBVjTdAFavw*Y7PaaHZCC)D*>Ems!|tI9@5`G;S- z$=lNdjGE_AN>*Vfav!in*1^p@??B}<;j^2F{E^jT-vuMz~tj+hbS2!(#qdgQK#8$K^>%s3<_r#h`outUd>I;Zxm7<={Ns3E#ySf-G&nZ4(jX_WV>QAQ?M(0M; zu1)M$+TNq7+^I&?p32+J4ioOcvVVNBVop1D+s>VxnXvL_8g7x{*&(+lPkY&3Yq%lF zd1hqE8@86fJ6j}jF;#fmU+enxIum!%YyEn^$gOz~i*MoGG!}jy7twvAQq2O8}S5Uw;iB-F1B zq6poffoGmkab-5YaPmoeGhhi?-4uYKN!kb)>yGr1UO}k~o6N(gY3t4Yv#u5v(#V{w zFh`xBZa$5H&nJp$^k3KY^6VD=ZGW#X_&;%N8atM>)FhdVu_6A>)HUm@`P`P0;yZO2 zz+AGHIQzK9Gg3o5GTU5T542io_p81)WG7S9gAf{3Bb8RlMxbC}$pe9y>|l~>tBI~; zx74oP#lEYl->G|53_%vQ>C}>T0&&MV#{#6czPi+HuP!vZ_+xqB$xUNZJ6`JGlE1he z6(dt83J!7xe(|Y)*Y*8+6~6xf*Y)#$nvz^7mMFB_xiFD3B$@`CxsagwV1cGmGsa|W zl5w6u1k_jhyfzU|)8g)HIgzcb?eNxivK2l|5r6@al|gfXjO{*|XhgNqe#>Kibc-9X zNSgVN+V7WcX1fFLWFp}gf%6m78A~R-irP5zlNO6-FfUe(D7xO*?O#c9R*>>$X zz$`%RO>6ysU+3s1t^I$N{(U`k6YSna#LcXQzt%6HWR@>HptFb#(s`0~&eg#K9mY;g zR%T~%%G;9?jB)LO}iF7-Wf z>T7Sb5ORE_RnF7$lhd%tsWEA0VMK<;+vWM)pF6UbF>puSn#Uex*h-B20if=dE1SykVzXeTTz3HIr%!r;jBs zH38$?n!%rleqr?3}Xz( z_&^@Sn|D43X=1mv6E2;i*|1q}u2*E2(qOa68Xvp|3ZUZvjP*6w=^8JG{28U)U#*Su zNKWTEjEW>^5U60t6m5*VFUmP`KfBkOwz1c|b>agh)wSN8pf>R(+|e>-HV1{z-ecKU zKKaE*J976n>fiPF3zaDQ`gTiS>iy1f?e(7(`3d3eCV2E~8H5(FtEd6nt2-QSbJd%$ z>w${ava!^D-v@_$8eE$hmg4^aQjhm)^9+Fd+kqN#6d%3G!QEZPr(>*ZR}g6a7SkfN z(yYMpuWut*(KsLK*bU9w?q`w7$u(<9(*}`i4~2C3mU(Wj8LxFqf#vR(83?1hal!&f zUCeQwPiW+{UW>n*7cRT{`FSrtyzVqV6xwPx^61k~8Pp)i4Z@3+Uaq70j@)*w+siw9 z%PFlVh&&S^Nl=~{NWliYn^m7g@V=jPx?-2sA_(GtD7@@G?*)GDTOVBGrFs^Lt4}m` z+J3VovTF@)CS{9oSy#I#U_C}hIL#ENc-mTutv*+ErN>jKJofY2 z{ewu1MTN2RK35n~wF00aNXAR^&o#kl7OSOrKUuyDwrj^tG9&z0&zUg?{m~!&^t(NN_2Wl5b0&Pa(J0!>a^Ne|t5PjI7{3({`ExZ$}OCpTwHxbyW^o<9ZCO;Dx z{3?GCqS!;HX$%M!7NE`l09~X$WS^Eo{uFqD1+A8mq>xwa)`Dr2_L^_rH{_)k@U7D3 z`M=1@{EyLoFa96!JnqIOi%>AX-d4w4bU*&8-NxB6gO9wsPp@zD=(S5xgu|%Ca6gQ8 z_WuC&R*neC#&)q82cZ7|IIUo;+Ay2Fn1pMr`Ou!%EKSNDiEQ|~N7D#g?x$`3K zy;*<8qOsNIv%(YR#r@VikD;!Oer(}#ze=NbqeXLy;H>$vG4fyLdKh@9(Ve=ObkWwFX$M;BIl{KQJi%@*bI;17^{io9 zx*rd4ah?~S&Z;|+SK((wT}NY|T;_s)U%RyW=Btc)5%^VzVlqV17f9e=zbm#CfBOEy zx0B^pIb$AYWJdAK?z>pAV})FDPo+n7b#0^r_OK?q9W9Z#U-#R$`=g(wRle~IPzJWq zgn+lo*KRQ*`h*|AKVN#yw9~Tt?s+|Se~m}9l(a_P$`tjUudwH9{d(igng0NEoCAy^ z5ALX~ds|4YBij=4dbW9Ld~6i}L4Y^_4k=8y;+$flXHs&T)OaB0a{hHHx}xzm5BGjy z`qgd!0PN%OsTF^BoW?WLBbv7)k&=d?G~z}8%u&9hpU$hyg@E(zmUZm$@Z<15(9@)i z{IVfttEuU=2t<{-(6{Mega=hbn@G4*65|TCz%5lQ;ZLrcQl%pHS8S89(VntE}!D~bzL$=go~Kk z2bcH1yOSyl40C`;KH&P*`21aUs)U1Cx|Ou}8c#JNK&E54wv7J(d4^XffB`(?724hW z4ArhARvNCKh0g{zUZ-|Fu-QM2RF~jhy|8&VIy$&exi2m=-Lbc?w?SD{io;!azJQ0OS!|rLToNMQoSod_Ss*ZsCe(U+iuKQpWk`jt&7I@G65* z@Xo9f&6`2gNgJ*i1)QHQ^;RqQe}#jLf3;k5gOy8azu~{7@-~JlNm@-c=6b|a_(#MR zIEv-1!~DR8%EsJs4?9mCdLH@4D=PcI-em5pEKP4P4zkAB4B6zN90Tfv^PiX7&&9s2 zrAc!Qg@yDY*7f91^jE}?vHQHQHPYMan!cBPbRmZFB}G|QXceR0rB$|UE47Hp9lBEH zn)Eh}N;B5>V|7mry@6$Uo9yg%eW6#FTR$%G>D#t{omA4U8+=6q$gA@K(XsqXo~m=y zXWqMOU2f-6l0kEAZ+e?VZbh$`COZPWXYf9_sH|+PMZef$yme??;v30!ym|iXU>pzs z09`L`)3;MqO8aZ+slm%-JT~&Qah8r{+QnBpSq^@0LJlgt7N2aC#~LY_afMdpm+t@v zQ;vF9rkyR|on&=iHcW-spvOMn;=trKGxvQv)QHOkwysR_E z^RhTdlYC_&f0nph(t zmL!!~)v`H_op4tq9gI2=$y2(8ElP1RFD%+0E)OURxW9Id%18<4V{$_;b#>}1E=W^q z<-pp)GLz&qC#Da4a6dZf8q|sK(I=A;XOuJ&Dny|C+m%k_UD!N*!@Kx#nmV)--dqga zuH;NcZl>M4oGgBBFyHL4BXZ-PO`|0AW-^adW8q2=eC1+G6-3+>K>1i2k_k85L$k?t zbu028FU;+O=y@FXsBR&i30@|GSj26yfjf+6I3pSL;+n0on3Rr0eYi;%8~r+W=M}Bv zC8N2P+ml|ycQvGmXMAmgD#t1y+)89<^yqA^RR-Av zco@mm_cx)jErsCss~@Z0Q`L_+t_Zf z{q#u{tk-Q3zmPGCG*l`QJbcgHj@x7>66V@5!Sguzq<%`^ zVR-&5)?|@hYo<*~@Z8*emfVz*?T#2LB85~PIdOs0tzwmx=JXC!w<=s-neVRu0JB(~ z!DCro9fhM%6$Uawo(CNBQA=Pfw7<0KIceOff)_#yD|5Ris^gsZ$g9?{O80i~yrrFn 
zKIk94bG32oxzFcKy40q>yq%1VbM|n^lD`VvkPo*!^U1&+D&;n%%Mp1Qw-)kF;pBfK zZ|=f5!l_=NasG2w74CeXEhI;cSLV2p##sLVOnOp%mPWP`&l=pwT%>Cl##CoK^uRyP znR6U5!3$l*Cn{ASh>QR^Kc9Mh$><{N3o}lu1oJ3usM|)Iir?Oy5_Ot7WCt#XjW!TEXR{vf>I52iX+ zQj)#f+xq^sH;Q{d*Y*DZCRC2@>N}e?n)XPeNkGzc4GopcrhZqEfht!#p1Hxv=b1u{ zCRuLeiFE5o~V+4^BA0N2funFHGLQL z{d$&_^#1^Z{{R7f%k?3S39P5QzY{v&v}NzL{bt;%K^%japn>zbw&W4@1df$%6q{0- zE6Z5rzIfwVtY1-s>~>mu0fe2CBr`K*lwk3a6b@-2vq+XZi`zLZ(#Y?IeR}uKmke>| zwZ8`>fDv2xy)l~5k4}5bm<8Rv< zpY`AT{R;BmJf$Gh=8E<0ZPZ_Dcd;yXma)h;J1hX;$jW+s?tLoVwV$5x-FZ=6{gKQ_ zvRk`BZyx}~Z33^(`%0;k7#ojFRj;-%yEaj3Ci3YVt=zF7X?HeAW!i!$*}9fFDyTD> z&DT6jr+9kW=`|aJa@*R`VYQw{c@!?~vrNZvKQ>O>51aj*RNc2|kC(so{e8rjjbo&_ zH>Q14#8)^;V}=x-X;8l?sCO(;EPstF6n2X6x=s@-^tSAt6$4JyJrElX93Ic{Oo8NAYrq(GU;Wg$Loy#_(9u73^q zgHrJOMXdO&!s1n1)-KW*qE!Scg&1iLNp48MUs};FNiJhQXq&Wu1%LASX`#;D=$dY& zso2A+_@Om@Gfz~3qqLRfy1Sl3_i|KVGj0QtLa%P4q1Cj_Lt2{SMr~T(PLlREhDw#M;K9UAN!dL2gpj)ld4kWRRjSLJNG^J%%w<+j{i=wYH(9?|pv%0G0IoH6ZZq zmxFvAcXcM8sa{{|FhmW-?~tnAoRi7v zSESOfJW+7^ZO)@Dl5S}=0j63>w{9?~ZY9P}4mby?-~mx7URU4O`u_liDk(xX+w=S1 z{0}09rl+S1skGk;S?XH8szQCL?%9R3YNPI{m9kPGKIl?9btjtMX`q3Pxkl8r7*$y; ztVi064%j9~mQ%D6Mi+VD4u?3YY^?l0qiB<978erP>F@w&x;go`4gr<;EL0AIn%Pj4+r>q?G41OBVAsGjx@;`mPc+|cL$TZXy5_GO_O#&ue`MWzZdCaYCzg; zfV#PZH`fJAM{#|f?hnX~D`2qC@Sxk*gIV9&*E)uoJeqP@Nq=E&vRUDXLu_Sd`Jq`K z2V(VX9Gr913Zou_q4;J;)HUnN?Qd7P9$4OnNeOOo3_#^K0oNtA{{VM2ZsYAY&1-ag zIV1+&a4ewJE+a@`;|4~Be&VPHXdsetfu3nw=7-FczhCnI0EhIspW(j_>zX~m)%D$4 z+B=D3D{#=s=1S0g{isv`o&Id}&NEqK#Cp$+waH-c^G!K``S8r>5L=w!9JkyZbIwn_ zbSbApsx`p9zSJ+B!mO8It-Kb)JffEjS(`aL7RKMZntT@a8We4!_&-aU=Ic^oWVvZt zNYMS=%-A3S&+AsK-TvkMf5Q;mlF|CNZ|n6tT|2}U_j;6?OWnrz_R0%w8&-ANGRi%P z}qkibl$$+qp{=axunxqvnEv1w37FKtxglg9t9EWiK$03K9e-Q zW$Q|JJ-TGGl7EO02p#fqTu!Gg{Ba})Ud61VN^8KzvRX$=jJ%0iK{{UnjF-8Y4b9|CoqVX6P z%gmmSR(bu}yGObHVuT;@t8s=SB~L26;;K#kp*y*ZLXoyk&VU2jrHEXG9%OI=tAJIB zKaV)CRo6-0$C>qcTFjg}sR4n{u1zC8e~Uk@GBCvc_Eb3grZKnh#U#Qh7#;g~=CoB4 z4`C(He(#r-{c%h(F5ndux3H%9aW4qUezd3+RE_-g>qoSRy@b0G76bu|d(>`7Y!A+Y zuOi`O3J0|?MYW4NmOn}rNcSt)OR;ZFm7={_aJfX4l(5bSiOxr%tPc<%M}c|yV^!W%F)odX{N(uxdY6T zyPxPgQ)Jp300B{rYS<207~_rzrna?g*f;?9<26yKZiKow>RD2FJ!mHftyTWmwK*ql zPx}U=I<2wK&-xmE*T`PME~Vhv$0U9gHM&hS%Mv3qw+aBxe+tX~)wRdp=fBEnPO}k_ zAMFx@7y$^#DhEIhY*Fnzh3pq|UC3@@A1MANQO!YfX738zi!jRULddL8k};4E;yd@O z={1<-c6PXt*@-O74oN*prbnv77y7wb2JP5ffuCGfck(OQd5beHO58k)=(kaCANSDa zog7m%Znou$1>9CC1j)Dk*5pzRX;ocC!)T{iwXK}(xJr}!FG>Ga1meQvtz_zRL9e^2r`NcA6! 
zk^u4R8e6V98%6;C0OT+0PHk`E)C-rn(OiFdqAQ=xMRgY%th#h2^HQG5+RiRK>7-O2 z@Da!84SDvx@tI>rw9#zixdpcUrtUwG{{Xt3PveiJYfiQ%ZqfXRlpvP2{6EOQC&Ukl zR_NYn@ZJ+4nn^YR^~(S&pVdA#_{AW2BcDV`Gq=sNLm^*qNh^zdo^KFwOAyYl}4 zGd@Q@g?;INk>=AUiM(TQZ!SL3e%S-go?n!G3lKjlcZ>9Cw70W!q}f*B0TYgd{d#>3 zVzKgbS4zcC4*46tza!_)_1X9r;wGWsStk1xg@wY-e|Tk&5)tSebB}J7$TAgTtO}5D zLFx~sYWRBk&rj5W)GQ&rvXpsqMK54~iBC{RsIN*-OPr9uHlK;@_CFdtL8z)dpM+x5`zFY`2zV{ncgjV+Yvv z{c5yOc*nzpoi^!p8)f;nfxM6S1AN2qHOBD@UjtY3(CSdP#;x+X>c8-_JXog<@aFqH zK3lMV_F9f_i#qHCEdKxxwMfo!{{SB&{{V@vHnaGe@JjIO8e6LX4%m*QHu`sBLD@ih9f z>G?nN75+F`Z7Ff#eP{wXR#hV(jdK^?IPli3evSQ)u3D?M($YufEMM&a4oCaGhl<#f zTUmF<1=I>t@1p1B`t`+2ZFamCyM?6bwFh74Q9MLyD@s3zMvS2Dt@!@{m}#W$0Gz)_Py0@w!towZE{K8CW1zdpB?eLAAmXS=~Q0s)k8)MVf&W~ zKV6xs(ceWC%#AG3`En1LJmLxZamQ-qPD@Cg7Eg0hJGdvh5SNnDMmx63+XH{QyBWrF zgZ!zwkb_UPEiyVr?}nZ@!vzJh&4oB+2Q9af7dSjo_M3bCI&`>-TZ?zv(ia)z9vHHo zqx-`IcLt%fvbP5j-(B0vR1NAChqmS8a6EJUYL#1AGF%G&WOtU*v=axK(3O-f(J(uR z$znFXGNf+mtW=&sHDLQZ!6UUm(Fn`8mY7m9Fu@5fst7q;FjfbRrnP6Wzp{0aEp%7B zMPVrN7Ac-GFnRlq2=)3^N13eWXf-QXtjvraHl8xOkNYmU!BNQmFRx19+AWCXB1yBV zTcKFx44aX>w%r*AE47o9IR5VC$8DzyLH0eZkKM%h4+@Q}7qAMe<;x#-jPt=|>rS`Z zA$Vd7Fq43>6=8)Pep8LvB;kVRf~=;cjxv_A%A3Hs`LIi43V3dS9+@YuPw<{C9wuFjlpVNWX~9!?H44SJ0JO&UjI8VvF}zmc~UBV~`|5s!l+|Fdp?F zx$xA;mk`}g9kh<3SX&Iy&p57QEvb9*^W(l#xZ6F`V zxE@=zMGl;oW;B@|IMfs7Pbtb0dIAR`woGm5)*Ec@EWn)L8C-RmW_T{HiHtOW_}qw$g4O zD#x^)+~fF1s=q%bd5IN2{ju{Ho+t$XHmO$xRfx#3M2cYw@!KMQQG)kCJdK4RrKT_B{d6#irO)r zS(z|f*jBVwmQX|HShbut5+KByY#(j7eY|aqzILjNe}zP|gNcow1{YQGOG#YerMwd`&cI|iaB5DfL-QBi&z5&{#jNpuL zI0CNEYjR_BYb&N$&^b*_*3Q-j{w8lM4er?7c|20v$dV+#*|NN>!EPoRGx;jt?Mrbd zDyOpc0G{=edj9}l@c#grX)AwU@ZXTtxNG=6(AQVDcGHiu%W~I3&fNAZ3Y-?_bMrPRO(D3xy+5y)k%3>%_R?b}f#E!MJXNN&=^sD%+?IKcE&M>g7W4p2| zG?KC5Mo7DEiC-TSrQxD-H$do4Dm)XRAl8(C#EY!J1q&6Ik#GWU)IHo zA3VrD*LxnI@*T6y3j&+S`>GIO6+rvWN!mCat0MDVx4N11jU!pLw77*^&08Be<}k(% ze*P6e`Lf+P#aK&t^~mH=bE;iKFZwx^?oE_SgUZDSLR+Xg0ONs?&1_of4X4|^m96EJ zFxp@xv@*#WE=S&($IRsQ!6P{*o+)nb#d~f2e_t=q!M5_Q?2wq|lFnix^R>xtC7$EA z?(LN10(01YIIGR!+grKjlUcpf?-J2NLuD)*E$r7A8;~3mv|s~))a|a5ZR8|PZDnn7 zBq&QTw-LOoSNqS6i`S;^Dw+u-o#Sifo$4Ccf77YjZSrS3VxoRn8@v&>7tU z#&G;b2{;+aJ#Ys#vjx5MvVQ7FZK6C#7Uan4K;=Q(jGS`Bk|`$e?w>SeFYN{TvxePu zDVP)VWA{hXpGv64Tax7!r}h1L>R`obHSBxztBdQ2l{T&Q-MN%)@3owwl7GAhIL2|s zUtKP1Q4A2BPHEkjeap(0CbP$v>?M9Rr=cf3Lj9O)pBa)TOw!+jFV9vNOwbEQ|(nS2%5p z+l+C=M)Ezr`StB@?BSnmcGeU5cm(h;2Ig#t*aWmw2r^mdlldYd)<-f8ua`Cw>OT4 zH6Fr7&px|%R+KjP_VLRZx^;{z2^ra>khvcyQMFVk^dGKkiEkI_7t&1HhluBBm5WDb zZie04J)S_vBiV6W{;lH=58J7_*?Vxx3az=+Wy${lfW`p)S+hme)t3JN*Fyc9>-~N2 z`sj4`+J1?qT-@q9o~g&GMkRw1!_?I&{vqpk;Klv6@kQBS z`4|5A;+v%|sf|iI-9N|tk4*6ok0jDO7Z#(eXs>JH1|lT1nQrA^Y-TMkSme3RR{{ZXN zRn!&6E+JR<1Ddjy))!#;Qb-&7r2^ygYR$&m-}5w6PiuYFBv>P5Kj^War6{#u;kF<5 z$)Ri-BpyxG(r2&$5B(01{{U#lFt<~-I&WX=x}x2gckaI-4WS(UoDcX7C)wo2Oh1m# z{2Jxk2Lu8h}BBX9P#_FKg^0PQFOn|FHgOQ(%welAt(LRG^QE4c@Ojo zYE{%NnQ|^B-P}0~{{XX8i#$Yb`&9Wpj1Tpt@AX9`4(B*APF}R$)S%aC#C-|y)o#Dx zU09TXrird6LK0S1`{VK+_0|dRe3da;T_)AxB;Lh8Ptf(OPlGq{r^BdXYvhTN>Ti*V z1Pp`yYrehK9c|CqX9a@icq-ZaE6J@0^S3AXf5AJ{r=m!}Tg^sH$TYjz7=6HEz{j8? 
z98x5={#wX%d#HM!EJuJnanZr2fNj-MVvhs%!oz#1M@NV z73RA4#zzPv(BeJwaeETrXZxw2)Stq#q3~w6uU?z)6Y9~zgp`Wf8D{tkD&Z(YyJ~i-SIWOrhkOT z3;f@XXq&|3u4U8hBqfJZMtv(fwWTdDp-`u1%=IrrHLS1(dl~KvW9BfkWO|y0LuCX~ zyw>6f2PD0-q7P0Afsd^?c#N!L&9br|C@O~GK>A{geQelz%r=hpuK z@J(&5#jQV4v7SFR1%_0|4;*fzkM~!ay)^bxLapT@5`X8L=RVo0vFc?aRC^nO0_Wuz z{6+xcn9g!ptnSJlpeug7Q@)n{P@Glu{ErX#pP;6h;xD&bt}U;DbB@xFz4`FpiLN1d zHShi{&>+|JD~n`?@;TS%gUsYCgQo5~@;@5z7gCN@gDWY+DI9~`SFeMs&raP@=hlZL z&uaK*s@Q9qANFRaa-!!?0b*Gk4>BXu89lmj+ZEisy_CD7hD6vf2bArP&c18WbtyER zQVYo5L9wvGNgI^)AC-GsC6Tj+=lf1@w+b35Ohb{3$dNYd^Ju8|HUbRrkXVKDEkfzYnxKq(5h8vAAQn6lRrI*ry+-HF?FTZt{{Ya` z7CtFmHg=0&(r)9EI4>FU-2wDhAE-6jG;E)C`P&>4WG4stQz!Cefl^DQpK0M_JJ27d z(_4FGEhI9CL{*VW;4hfq z59eIX?}W5#z4Cd68DA&;8BbsLPs`0^e_{B8Oc~O3N0KCLsu%Ll{qo-}B~BCgWp7pb zkDr&~kKFEhBU|K_EfbN*2oQ|_0N+%C(Gl~qIT&)FuLJPkir{1MzlgNCSv8pEnlf>m zBz)tbZH%V1?0j*i-9Fi_B)0GS><`ooe=3>b>O*CJLrBh?_Ga06?5>MMrC6W&X50__ zdapsL-97^aX(KLcB~lo~(3Z@RNfuXjdaq4~+oz^E ztdO}VKZ!i0s4W`ij4&&*n!OI=aGDi$r&YW77;XgZ}X8#z%T-*^<_qnRjwtX|qA6 z>Q?4L5m{~`%Ed6x@dw5@7$6W1K^^L4)U?TEhkev~dZ8qXn^IH|3BVkw?SZ&@`5;c1JReN`D^6#>LT%If z{Z3ly*4M<>Y%N}UsYEPT!m3@DAOKQA8S&5YZWzW-6~gKk62T}_Adv#(F!ZcWjqmHr+uPWOo?jo*jY+f_c!*^5a5oTcsU#u&meU;s8dYVtktIe*qN=9{bZ&; zIyVQ6_ zN03{-KyD3n+Lwm4nM&O(8Z7a^p}?{nl~pZSF^i6i`uvJk@<@z`(#_=V$CdN>Y`xcTVYk2IS-09eH<+B-+!$-d1Ik z*W}%wuLGKKot@$mT+7}$3y7w-z4E3uk;GZljU!5 zxOL*X6-6a!8FIM4bh91pz0I4Hy}Xsesd%Mu$h_ku^uRspb&aI73MIFi68lO6uxHnH zeq8qBn#B=K2{NqYq5#TT0T5C;5xm$_n@NojWn^8^#~}4MT>k)`m4x7z z!fhG1xY=xNUKm|rXLWprP^{2g*?}Sq{KZjE&FS-F zAB|0AWM}e~4XZ0QM)x+mhC%sU<0t%UZr4P&mQ+i-yJ@zz=18?%6;DQIUROQT{&kzE z*HNghi01zQ;&AZW+ccLqh&b}v7cRZ+$ab_deBOD!FD#U#ES*O1_-id2pSa0krI zCj}69&nF`l(phNM@lJ_nYO%Ocv+TEbO0t93a}?wcW6$eW29;%PETN>)pis)>uxHzW z_lmTHG3ShqIO|!;t;Uo7u4BO-oQVh!=%}%|Qmv>o@kK5G+prV9Ty`HQ6|V$0623zZ{Zy61D2;laizk-OL=ZCq>C=Hrk}JTwgYT2sL4KovP~`B)XMV< zeLmvdlrt&&WD7Yh)l@MhbI8w8$Q7Gul#_nH*P|m#Zz19_!wf_Q^4d7$xH0!)ttbeX z>RkR6LVNjUAo;B^$xzPUY;xc1#)_!U-8UYd)mrBA?&3#Frqbhuuzt$0O?>;uPclJ} zFj#T(WbxOfk0Vr`;KE^^@@<>i8=XyHGIM~wLT>Wc+aUh{53O8c!L{6mJHh8a_MP}! zaCbM_F5+-cRoxULEPH@h0i5LGuLzu|WJ}9iHY0NR5ozfl=LLxQf&foJ&PRH+9g^5s zeXCHK(@#jw)mvK#(PEQ~m12dQgN}n4>6})CF-Z2zY^<52AcmIWIYUY^IX^F$A{#qdqf*Hl+7QdS3wV5>&H=y@vKeG$Tyg-dD_t^6 zX-KxYQsV<^BQEwQB+D<}bKe=pYLvb)(^lpv^y_^?QjS5NwOiUesDnKGxC0~Gihz7Y z(q z1XJuQsYY>+GL61wPB3!ia{LdbFT7*ni%76<;FY=C97;~(*BlIWu5w?C8eq2yH2aje z4f4d(fE(X%{#E00UEWI}mgCELTlm~$pT?L4&Ub8#=eSj@J*3}}vZYCR`u_mHvFkdg z#T)0g4QXwsu6nEPeW3c2fNPDs@z;o^yIaXGZeW=}07O+&kM=S8*EX|XC>vb>>NcwX z07F&eYoNzJG4I7irr*5oTI7oNRgG;+#ai9g_VX2`VfUp4!1vqJKKQN<@>}yMmgZSm zw`X~5jMqvczdYhW_!^C^F32j`8R`#DyeM;T_3us*2gvKLw!z-n@(?) 
[Base85-encoded git binary-patch payload omitted: machine-generated delta data for the binary files added by this patch, not human-readable.]